]>
Commit | Line | Data |
---|---|---|
42bcb36c BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
42bcb36c BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. | |
23 | * Use is subject to license terms. | |
24 | */ | |
25 | ||
e5dc681a | 26 | |
42bcb36c BB |
27 | |
28 | /* | |
29 | * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. | |
30 | * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) | |
31 | * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), | |
32 | * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also | |
33 | * the section 3C man pages. | |
34 | * Interface stability: Committed | |
35 | */ | |
36 | ||
37 | #include <sys/types.h> | |
38 | #ifdef _KERNEL | |
39 | #include <sys/param.h> | |
40 | #include <sys/sysmacros.h> | |
42bcb36c BB |
41 | #include <sys/debug.h> |
42 | #include <sys/kmem.h> | |
43 | #include <sys/sunddi.h> | |
44 | #else | |
45 | #include <sys/u8_textprep.h> | |
46 | #endif /* _KERNEL */ | |
47 | #include <sys/byteorder.h> | |
48 | #include <sys/errno.h> | |
49 | ||
50 | ||
51 | /* | |
52 | * The max and min values of high and low surrogate pairs of UTF-16, | |
53 | * UTF-16 bit shift value, bit mask, and starting value outside of BMP. | |
54 | */ | |
55 | #define UCONV_U16_HI_MIN (0xd800U) | |
56 | #define UCONV_U16_HI_MAX (0xdbffU) | |
57 | #define UCONV_U16_LO_MIN (0xdc00U) | |
58 | #define UCONV_U16_LO_MAX (0xdfffU) | |
59 | #define UCONV_U16_BIT_SHIFT (0x0400U) | |
60 | #define UCONV_U16_BIT_MASK (0x0fffffU) | |
61 | #define UCONV_U16_START (0x010000U) | |
62 | ||
63 | /* The maximum value of Unicode coding space and ASCII coding space. */ | |
64 | #define UCONV_UNICODE_MAX (0x10ffffU) | |
65 | #define UCONV_ASCII_MAX (0x7fU) | |
66 | ||
67 | /* The mask values for input and output endians. */ | |
68 | #define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) | |
69 | #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) | |
70 | ||
71 | /* Native and reversed endian macros. */ | |
5678d3f5 | 72 | #ifdef _ZFS_BIG_ENDIAN |
42bcb36c BB |
73 | #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN |
74 | #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN | |
75 | #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN | |
76 | #define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN | |
77 | #else | |
78 | #define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN | |
79 | #define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN | |
80 | #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN | |
81 | #define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN | |
82 | #endif /* _ZFS_BIG_ENDIAN */ | |
83 | ||
84 | /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */ | |
85 | #define UCONV_BOM_NORMAL (0xfeffU) | |
86 | #define UCONV_BOM_SWAPPED (0xfffeU) | |
87 | #define UCONV_BOM_SWAPPED_32 (0xfffe0000U) | |
88 | ||
89 | /* UTF-32 boundaries based on UTF-8 character byte lengths. */ | |
90 | #define UCONV_U8_ONE_BYTE (0x7fU) | |
91 | #define UCONV_U8_TWO_BYTES (0x7ffU) | |
92 | #define UCONV_U8_THREE_BYTES (0xffffU) | |
93 | #define UCONV_U8_FOUR_BYTES (0x10ffffU) | |
94 | ||
95 | /* The common minimum and maximum values at the UTF-8 character bytes. */ | |
96 | #define UCONV_U8_BYTE_MIN (0x80U) | |
97 | #define UCONV_U8_BYTE_MAX (0xbfU) | |
98 | ||
99 | /* | |
100 | * The following "6" and "0x3f" came from "10xx xxxx" bit representation of | |
101 | * UTF-8 character bytes. | |
102 | */ | |
103 | #define UCONV_U8_BIT_SHIFT 6 | |
104 | #define UCONV_U8_BIT_MASK 0x3f | |
105 | ||
106 | /* | |
107 | * The following vector shows remaining bytes in a UTF-8 character. | |
108 | * Index will be the first byte of the character. | |
109 | */ | |
110 | static const uchar_t remaining_bytes_tbl[0x100] = { | |
111 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
112 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
113 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
114 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
115 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
116 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
117 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
118 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
119 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
120 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
121 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
122 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
123 | ||
124 | /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ | |
125 | 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
126 | ||
127 | /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ | |
128 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
129 | ||
130 | /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ | |
131 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
132 | ||
133 | /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ | |
134 | 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
135 | }; | |
136 | ||
137 | /* | |
138 | * The following is a vector of bit-masks used to extract the data bits of | |
139 | * the first byte of a UTF-8 character. The index is the number of remaining | |
140 | * bytes of the character, as given by the vector above. | |
141 | */ | |
42bcb36c | 142 | static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; |
42bcb36c BB |
143 | |
144 | /* | |
145 | * The following two vectors are to provide valid minimum and | |
146 | * maximum values for the 2'nd byte of a multibyte UTF-8 character for | |
147 | * better illegal sequence checking. The index value must be the value of | |
148 | * the first byte of the UTF-8 character. | |
149 | */ | |
150 | static const uchar_t valid_min_2nd_byte[0x100] = { | |
151 | 0, 0, 0, 0, 0, 0, 0, 0, | |
152 | 0, 0, 0, 0, 0, 0, 0, 0, | |
153 | 0, 0, 0, 0, 0, 0, 0, 0, | |
154 | 0, 0, 0, 0, 0, 0, 0, 0, | |
155 | 0, 0, 0, 0, 0, 0, 0, 0, | |
156 | 0, 0, 0, 0, 0, 0, 0, 0, | |
157 | 0, 0, 0, 0, 0, 0, 0, 0, | |
158 | 0, 0, 0, 0, 0, 0, 0, 0, | |
159 | 0, 0, 0, 0, 0, 0, 0, 0, | |
160 | 0, 0, 0, 0, 0, 0, 0, 0, | |
161 | 0, 0, 0, 0, 0, 0, 0, 0, | |
162 | 0, 0, 0, 0, 0, 0, 0, 0, | |
163 | 0, 0, 0, 0, 0, 0, 0, 0, | |
164 | 0, 0, 0, 0, 0, 0, 0, 0, | |
165 | 0, 0, 0, 0, 0, 0, 0, 0, | |
166 | 0, 0, 0, 0, 0, 0, 0, 0, | |
167 | 0, 0, 0, 0, 0, 0, 0, 0, | |
168 | 0, 0, 0, 0, 0, 0, 0, 0, | |
169 | 0, 0, 0, 0, 0, 0, 0, 0, | |
170 | 0, 0, 0, 0, 0, 0, 0, 0, | |
171 | 0, 0, 0, 0, 0, 0, 0, 0, | |
172 | 0, 0, 0, 0, 0, 0, 0, 0, | |
173 | 0, 0, 0, 0, 0, 0, 0, 0, | |
174 | 0, 0, 0, 0, 0, 0, 0, 0, | |
175 | ||
176 | /* C0 C1 C2 C3 C4 C5 C6 C7 */ | |
177 | 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, | |
178 | ||
179 | /* C8 C9 CA CB CC CD CE CF */ | |
180 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, | |
181 | ||
182 | /* D0 D1 D2 D3 D4 D5 D6 D7 */ | |
183 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, | |
184 | ||
185 | /* D8 D9 DA DB DC DD DE DF */ | |
186 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, | |
187 | ||
188 | /* E0 E1 E2 E3 E4 E5 E6 E7 */ | |
189 | 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, | |
190 | ||
191 | /* E8 E9 EA EB EC ED EE EF */ | |
192 | 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, | |
193 | ||
194 | /* F0 F1 F2 F3 F4 F5 F6 F7 */ | |
195 | 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, | |
196 | ||
197 | 0, 0, 0, 0, 0, 0, 0, 0 | |
198 | }; | |
199 | ||
200 | static const uchar_t valid_max_2nd_byte[0x100] = { | |
201 | 0, 0, 0, 0, 0, 0, 0, 0, | |
202 | 0, 0, 0, 0, 0, 0, 0, 0, | |
203 | 0, 0, 0, 0, 0, 0, 0, 0, | |
204 | 0, 0, 0, 0, 0, 0, 0, 0, | |
205 | 0, 0, 0, 0, 0, 0, 0, 0, | |
206 | 0, 0, 0, 0, 0, 0, 0, 0, | |
207 | 0, 0, 0, 0, 0, 0, 0, 0, | |
208 | 0, 0, 0, 0, 0, 0, 0, 0, | |
209 | 0, 0, 0, 0, 0, 0, 0, 0, | |
210 | 0, 0, 0, 0, 0, 0, 0, 0, | |
211 | 0, 0, 0, 0, 0, 0, 0, 0, | |
212 | 0, 0, 0, 0, 0, 0, 0, 0, | |
213 | 0, 0, 0, 0, 0, 0, 0, 0, | |
214 | 0, 0, 0, 0, 0, 0, 0, 0, | |
215 | 0, 0, 0, 0, 0, 0, 0, 0, | |
216 | 0, 0, 0, 0, 0, 0, 0, 0, | |
217 | 0, 0, 0, 0, 0, 0, 0, 0, | |
218 | 0, 0, 0, 0, 0, 0, 0, 0, | |
219 | 0, 0, 0, 0, 0, 0, 0, 0, | |
220 | 0, 0, 0, 0, 0, 0, 0, 0, | |
221 | 0, 0, 0, 0, 0, 0, 0, 0, | |
222 | 0, 0, 0, 0, 0, 0, 0, 0, | |
223 | 0, 0, 0, 0, 0, 0, 0, 0, | |
224 | 0, 0, 0, 0, 0, 0, 0, 0, | |
225 | ||
226 | /* C0 C1 C2 C3 C4 C5 C6 C7 */ | |
227 | 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, | |
228 | ||
229 | /* C8 C9 CA CB CC CD CE CF */ | |
230 | 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, | |
231 | ||
232 | /* D0 D1 D2 D3 D4 D5 D6 D7 */ | |
233 | 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, | |
234 | ||
235 | /* D8 D9 DA DB DC DD DE DF */ | |
236 | 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, | |
237 | ||
238 | /* E0 E1 E2 E3 E4 E5 E6 E7 */ | |
239 | 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, | |
240 | ||
241 | /* E8 E9 EA EB EC ED EE EF */ | |
242 | 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, | |
243 | ||
244 | /* F0 F1 F2 F3 F4 F5 F6 F7 */ | |
245 | 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, | |
246 | ||
247 | 0, 0, 0, 0, 0, 0, 0, 0 | |
248 | }; | |
249 | ||
250 | ||
251 | static int | |
252 | check_endian(int flag, int *in, int *out) | |
253 | { | |
254 | *in = flag & UCONV_IN_ENDIAN_MASKS; | |
255 | ||
256 | /* You cannot have both. */ | |
257 | if (*in == UCONV_IN_ENDIAN_MASKS) | |
258 | return (EBADF); | |
259 | ||
260 | if (*in == 0) | |
261 | *in = UCONV_IN_NAT_ENDIAN; | |
262 | ||
263 | *out = flag & UCONV_OUT_ENDIAN_MASKS; | |
264 | ||
265 | /* You cannot have both. */ | |
266 | if (*out == UCONV_OUT_ENDIAN_MASKS) | |
267 | return (EBADF); | |
268 | ||
269 | if (*out == 0) | |
270 | *out = UCONV_OUT_NAT_ENDIAN; | |
271 | ||
272 | return (0); | |
273 | } | |
274 | ||
275 | static boolean_t | |
276 | check_bom16(const uint16_t *u16s, size_t u16l, int *in) | |
277 | { | |
278 | if (u16l > 0) { | |
279 | if (*u16s == UCONV_BOM_NORMAL) { | |
280 | *in = UCONV_IN_NAT_ENDIAN; | |
281 | return (B_TRUE); | |
282 | } | |
283 | if (*u16s == UCONV_BOM_SWAPPED) { | |
284 | *in = UCONV_IN_REV_ENDIAN; | |
285 | return (B_TRUE); | |
286 | } | |
287 | } | |
288 | ||
289 | return (B_FALSE); | |
290 | } | |
291 | ||
292 | static boolean_t | |
293 | check_bom32(const uint32_t *u32s, size_t u32l, int *in) | |
294 | { | |
295 | if (u32l > 0) { | |
296 | if (*u32s == UCONV_BOM_NORMAL) { | |
297 | *in = UCONV_IN_NAT_ENDIAN; | |
298 | return (B_TRUE); | |
299 | } | |
300 | if (*u32s == UCONV_BOM_SWAPPED_32) { | |
301 | *in = UCONV_IN_REV_ENDIAN; | |
302 | return (B_TRUE); | |
303 | } | |
304 | } | |
305 | ||
306 | return (B_FALSE); | |
307 | } | |
308 | ||
309 | int | |
310 | uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, | |
311 | uint32_t *u32s, size_t *utf32len, int flag) | |
312 | { | |
313 | int inendian; | |
314 | int outendian; | |
315 | size_t u16l; | |
316 | size_t u32l; | |
317 | uint32_t hi; | |
318 | uint32_t lo; | |
319 | boolean_t do_not_ignore_null; | |
320 | ||
321 | /* | |
322 | * Do preliminary validity checks on parameters and collect info on | |
323 | * endians. | |
324 | */ | |
325 | if (u16s == NULL || utf16len == NULL) | |
326 | return (EILSEQ); | |
327 | ||
328 | if (u32s == NULL || utf32len == NULL) | |
329 | return (E2BIG); | |
330 | ||
331 | if (check_endian(flag, &inendian, &outendian) != 0) | |
332 | return (EBADF); | |
333 | ||
334 | /* | |
335 | * Initialize input and output parameter buffer indices and | |
336 | * temporary variables. | |
337 | */ | |
338 | u16l = u32l = 0; | |
339 | hi = 0; | |
340 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
341 | ||
342 | /* | |
343 | * Check on the BOM at the beginning of the input buffer if required | |
344 | * and if there is indeed one, process it. | |
345 | */ | |
346 | if ((flag & UCONV_IN_ACCEPT_BOM) && | |
347 | check_bom16(u16s, *utf16len, &inendian)) | |
348 | u16l++; | |
349 | ||
350 | /* | |
351 | * Reset inendian and outendian so that after this point, those can be | |
352 | * used as condition values. | |
353 | */ | |
354 | inendian &= UCONV_IN_NAT_ENDIAN; | |
355 | outendian &= UCONV_OUT_NAT_ENDIAN; | |
356 | ||
357 | /* | |
358 | * If there is something in the input buffer and if necessary and | |
359 | * requested, save the BOM at the output buffer. | |
360 | */ | |
361 | if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) | |
362 | u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : | |
363 | UCONV_BOM_SWAPPED_32; | |
364 | ||
365 | /* | |
366 | * Do conversion; if encounter a surrogate pair, assemble high and | |
367 | * low pair values to form a UTF-32 character. If a half of a pair | |
368 | * exists alone, then, either it is an illegal (EILSEQ) or | |
369 | * invalid (EINVAL) value. | |
370 | */ | |
371 | for (; u16l < *utf16len; u16l++) { | |
372 | if (u16s[u16l] == 0 && do_not_ignore_null) | |
373 | break; | |
374 | ||
375 | lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); | |
376 | ||
377 | if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { | |
378 | if (hi) | |
379 | return (EILSEQ); | |
380 | hi = lo; | |
381 | continue; | |
382 | } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { | |
383 | if (! hi) | |
384 | return (EILSEQ); | |
385 | lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + | |
386 | lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) | |
387 | + UCONV_U16_START; | |
388 | hi = 0; | |
389 | } else if (hi) { | |
390 | return (EILSEQ); | |
391 | } | |
392 | ||
393 | if (u32l >= *utf32len) | |
394 | return (E2BIG); | |
395 | ||
396 | u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); | |
397 | } | |
398 | ||
399 | /* | |
400 | * If high half didn't see low half, then, it's most likely the input | |
401 | * parameter is incomplete. | |
402 | */ | |
403 | if (hi) | |
404 | return (EINVAL); | |
405 | ||
406 | /* | |
407 | * Save the number of consumed and saved characters. They do not | |
408 | * include terminating NULL character (U+0000) at the end of | |
409 | * the input buffer (even when UCONV_IGNORE_NULL isn't specified and | |
410 | * the input buffer length is big enough to include the terminating | |
411 | * NULL character). | |
412 | */ | |
413 | *utf16len = u16l; | |
414 | *utf32len = u32l; | |
415 | ||
416 | return (0); | |
417 | } | |
418 | ||
419 | int | |
420 | uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, | |
421 | uchar_t *u8s, size_t *utf8len, int flag) | |
422 | { | |
423 | int inendian; | |
424 | int outendian; | |
425 | size_t u16l; | |
426 | size_t u8l; | |
427 | uint32_t hi; | |
428 | uint32_t lo; | |
429 | boolean_t do_not_ignore_null; | |
430 | ||
431 | if (u16s == NULL || utf16len == NULL) | |
432 | return (EILSEQ); | |
433 | ||
434 | if (u8s == NULL || utf8len == NULL) | |
435 | return (E2BIG); | |
436 | ||
437 | if (check_endian(flag, &inendian, &outendian) != 0) | |
438 | return (EBADF); | |
439 | ||
440 | u16l = u8l = 0; | |
441 | hi = 0; | |
442 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
443 | ||
444 | if ((flag & UCONV_IN_ACCEPT_BOM) && | |
445 | check_bom16(u16s, *utf16len, &inendian)) | |
446 | u16l++; | |
447 | ||
448 | inendian &= UCONV_IN_NAT_ENDIAN; | |
449 | ||
450 | for (; u16l < *utf16len; u16l++) { | |
451 | if (u16s[u16l] == 0 && do_not_ignore_null) | |
452 | break; | |
453 | ||
454 | lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); | |
455 | ||
456 | if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { | |
457 | if (hi) | |
458 | return (EILSEQ); | |
459 | hi = lo; | |
460 | continue; | |
461 | } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { | |
462 | if (! hi) | |
463 | return (EILSEQ); | |
464 | lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + | |
465 | lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) | |
466 | + UCONV_U16_START; | |
467 | hi = 0; | |
468 | } else if (hi) { | |
469 | return (EILSEQ); | |
470 | } | |
471 | ||
472 | /* | |
473 | * Now we convert a UTF-32 character into a UTF-8 character. | |
474 | * Unicode coding space is between U+0000 and U+10FFFF; | |
475 | * anything bigger is an illegal character. | |
476 | */ | |
477 | if (lo <= UCONV_U8_ONE_BYTE) { | |
478 | if (u8l >= *utf8len) | |
479 | return (E2BIG); | |
480 | u8s[u8l++] = (uchar_t)lo; | |
481 | } else if (lo <= UCONV_U8_TWO_BYTES) { | |
482 | if ((u8l + 1) >= *utf8len) | |
483 | return (E2BIG); | |
484 | u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); | |
485 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); | |
486 | } else if (lo <= UCONV_U8_THREE_BYTES) { | |
487 | if ((u8l + 2) >= *utf8len) | |
488 | return (E2BIG); | |
489 | u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); | |
490 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); | |
491 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); | |
492 | } else if (lo <= UCONV_U8_FOUR_BYTES) { | |
493 | if ((u8l + 3) >= *utf8len) | |
494 | return (E2BIG); | |
495 | u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); | |
496 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); | |
497 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); | |
498 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); | |
499 | } else { | |
500 | return (EILSEQ); | |
501 | } | |
502 | } | |
503 | ||
504 | if (hi) | |
505 | return (EINVAL); | |
506 | ||
507 | *utf16len = u16l; | |
508 | *utf8len = u8l; | |
509 | ||
510 | return (0); | |
511 | } | |
512 | ||
513 | int | |
514 | uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, | |
515 | uint16_t *u16s, size_t *utf16len, int flag) | |
516 | { | |
517 | int inendian; | |
518 | int outendian; | |
519 | size_t u16l; | |
520 | size_t u32l; | |
521 | uint32_t hi; | |
522 | uint32_t lo; | |
523 | boolean_t do_not_ignore_null; | |
524 | ||
525 | if (u32s == NULL || utf32len == NULL) | |
526 | return (EILSEQ); | |
527 | ||
528 | if (u16s == NULL || utf16len == NULL) | |
529 | return (E2BIG); | |
530 | ||
531 | if (check_endian(flag, &inendian, &outendian) != 0) | |
532 | return (EBADF); | |
533 | ||
534 | u16l = u32l = 0; | |
535 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
536 | ||
537 | if ((flag & UCONV_IN_ACCEPT_BOM) && | |
538 | check_bom32(u32s, *utf32len, &inendian)) | |
539 | u32l++; | |
540 | ||
541 | inendian &= UCONV_IN_NAT_ENDIAN; | |
542 | outendian &= UCONV_OUT_NAT_ENDIAN; | |
543 | ||
544 | if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) | |
545 | u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : | |
546 | UCONV_BOM_SWAPPED; | |
547 | ||
548 | for (; u32l < *utf32len; u32l++) { | |
549 | if (u32s[u32l] == 0 && do_not_ignore_null) | |
550 | break; | |
551 | ||
552 | hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); | |
553 | ||
554 | /* | |
555 | * Anything bigger than the Unicode coding space, i.e., | |
556 | * Unicode scalar value bigger than U+10FFFF, is an illegal | |
557 | * character. | |
558 | */ | |
559 | if (hi > UCONV_UNICODE_MAX) | |
560 | return (EILSEQ); | |
561 | ||
562 | /* | |
563 | * Anything bigger than U+FFFF must be converted into | |
564 | * a surrogate pair in UTF-16. | |
565 | */ | |
566 | if (hi >= UCONV_U16_START) { | |
567 | lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + | |
568 | UCONV_U16_LO_MIN; | |
569 | hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + | |
570 | UCONV_U16_HI_MIN; | |
571 | ||
572 | if ((u16l + 1) >= *utf16len) | |
573 | return (E2BIG); | |
574 | ||
575 | if (outendian) { | |
576 | u16s[u16l++] = (uint16_t)hi; | |
577 | u16s[u16l++] = (uint16_t)lo; | |
578 | } else { | |
579 | u16s[u16l++] = BSWAP_16(((uint16_t)hi)); | |
580 | u16s[u16l++] = BSWAP_16(((uint16_t)lo)); | |
581 | } | |
582 | } else { | |
583 | if (u16l >= *utf16len) | |
584 | return (E2BIG); | |
585 | u16s[u16l++] = (outendian) ? (uint16_t)hi : | |
586 | BSWAP_16(((uint16_t)hi)); | |
587 | } | |
588 | } | |
589 | ||
590 | *utf16len = u16l; | |
591 | *utf32len = u32l; | |
592 | ||
593 | return (0); | |
594 | } | |
595 | ||
596 | int | |
597 | uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, | |
598 | uchar_t *u8s, size_t *utf8len, int flag) | |
599 | { | |
600 | int inendian; | |
601 | int outendian; | |
602 | size_t u32l; | |
603 | size_t u8l; | |
604 | uint32_t lo; | |
605 | boolean_t do_not_ignore_null; | |
606 | ||
607 | if (u32s == NULL || utf32len == NULL) | |
608 | return (EILSEQ); | |
609 | ||
610 | if (u8s == NULL || utf8len == NULL) | |
611 | return (E2BIG); | |
612 | ||
613 | if (check_endian(flag, &inendian, &outendian) != 0) | |
614 | return (EBADF); | |
615 | ||
616 | u32l = u8l = 0; | |
617 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
618 | ||
619 | if ((flag & UCONV_IN_ACCEPT_BOM) && | |
620 | check_bom32(u32s, *utf32len, &inendian)) | |
621 | u32l++; | |
622 | ||
623 | inendian &= UCONV_IN_NAT_ENDIAN; | |
624 | ||
625 | for (; u32l < *utf32len; u32l++) { | |
626 | if (u32s[u32l] == 0 && do_not_ignore_null) | |
627 | break; | |
628 | ||
629 | lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); | |
630 | ||
631 | if (lo <= UCONV_U8_ONE_BYTE) { | |
632 | if (u8l >= *utf8len) | |
633 | return (E2BIG); | |
634 | u8s[u8l++] = (uchar_t)lo; | |
635 | } else if (lo <= UCONV_U8_TWO_BYTES) { | |
636 | if ((u8l + 1) >= *utf8len) | |
637 | return (E2BIG); | |
638 | u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); | |
639 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); | |
640 | } else if (lo <= UCONV_U8_THREE_BYTES) { | |
641 | if ((u8l + 2) >= *utf8len) | |
642 | return (E2BIG); | |
643 | u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); | |
644 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); | |
645 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); | |
646 | } else if (lo <= UCONV_U8_FOUR_BYTES) { | |
647 | if ((u8l + 3) >= *utf8len) | |
648 | return (E2BIG); | |
649 | u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); | |
650 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); | |
651 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); | |
652 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); | |
653 | } else { | |
654 | return (EILSEQ); | |
655 | } | |
656 | } | |
657 | ||
658 | *utf32len = u32l; | |
659 | *utf8len = u8l; | |
660 | ||
661 | return (0); | |
662 | } | |
663 | ||
664 | int | |
665 | uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, | |
666 | uint16_t *u16s, size_t *utf16len, int flag) | |
667 | { | |
668 | int inendian; | |
669 | int outendian; | |
670 | size_t u16l; | |
671 | size_t u8l; | |
672 | uint32_t hi; | |
673 | uint32_t lo; | |
674 | int remaining_bytes; | |
675 | int first_b; | |
676 | boolean_t do_not_ignore_null; | |
677 | ||
678 | if (u8s == NULL || utf8len == NULL) | |
679 | return (EILSEQ); | |
680 | ||
681 | if (u16s == NULL || utf16len == NULL) | |
682 | return (E2BIG); | |
683 | ||
684 | if (check_endian(flag, &inendian, &outendian) != 0) | |
685 | return (EBADF); | |
686 | ||
687 | u16l = u8l = 0; | |
688 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
689 | ||
690 | outendian &= UCONV_OUT_NAT_ENDIAN; | |
691 | ||
692 | if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) | |
693 | u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : | |
694 | UCONV_BOM_SWAPPED; | |
695 | ||
696 | for (; u8l < *utf8len; ) { | |
697 | if (u8s[u8l] == 0 && do_not_ignore_null) | |
698 | break; | |
699 | ||
700 | /* | |
701 | * Collect a UTF-8 character and convert it to a UTF-32 | |
702 | * character. In doing so, we screen out illegally formed | |
703 | * UTF-8 characters and treat such as illegal characters. | |
704 | * The algorithm at below also screens out anything bigger | |
705 | * than the U+10FFFF. | |
706 | * | |
707 | * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for | |
708 | * more details on the illegal values of UTF-8 character | |
709 | * bytes. | |
710 | */ | |
711 | hi = (uint32_t)u8s[u8l++]; | |
712 | ||
713 | if (hi > UCONV_ASCII_MAX) { | |
714 | if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) | |
715 | return (EILSEQ); | |
716 | ||
717 | first_b = hi; | |
718 | hi = hi & u8_masks_tbl[remaining_bytes]; | |
719 | ||
720 | for (; remaining_bytes > 0; remaining_bytes--) { | |
721 | /* | |
722 | * If we have no more bytes, the current | |
723 | * UTF-8 character is incomplete. | |
724 | */ | |
725 | if (u8l >= *utf8len) | |
726 | return (EINVAL); | |
727 | ||
728 | lo = (uint32_t)u8s[u8l++]; | |
729 | ||
730 | if (first_b) { | |
731 | if (lo < valid_min_2nd_byte[first_b] || | |
732 | lo > valid_max_2nd_byte[first_b]) | |
733 | return (EILSEQ); | |
734 | first_b = 0; | |
735 | } else if (lo < UCONV_U8_BYTE_MIN || | |
736 | lo > UCONV_U8_BYTE_MAX) { | |
737 | return (EILSEQ); | |
738 | } | |
739 | hi = (hi << UCONV_U8_BIT_SHIFT) | | |
740 | (lo & UCONV_U8_BIT_MASK); | |
741 | } | |
742 | } | |
743 | ||
744 | if (hi >= UCONV_U16_START) { | |
745 | lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + | |
746 | UCONV_U16_LO_MIN; | |
747 | hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + | |
748 | UCONV_U16_HI_MIN; | |
749 | ||
750 | if ((u16l + 1) >= *utf16len) | |
751 | return (E2BIG); | |
752 | ||
753 | if (outendian) { | |
754 | u16s[u16l++] = (uint16_t)hi; | |
755 | u16s[u16l++] = (uint16_t)lo; | |
756 | } else { | |
757 | u16s[u16l++] = BSWAP_16(((uint16_t)hi)); | |
758 | u16s[u16l++] = BSWAP_16(((uint16_t)lo)); | |
759 | } | |
760 | } else { | |
761 | if (u16l >= *utf16len) | |
762 | return (E2BIG); | |
763 | ||
764 | u16s[u16l++] = (outendian) ? (uint16_t)hi : | |
765 | BSWAP_16(((uint16_t)hi)); | |
766 | } | |
767 | } | |
768 | ||
769 | *utf16len = u16l; | |
770 | *utf8len = u8l; | |
771 | ||
772 | return (0); | |
773 | } | |
774 | ||
775 | int | |
776 | uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, | |
777 | uint32_t *u32s, size_t *utf32len, int flag) | |
778 | { | |
779 | int inendian; | |
780 | int outendian; | |
781 | size_t u32l; | |
782 | size_t u8l; | |
783 | uint32_t hi; | |
784 | uint32_t c; | |
785 | int remaining_bytes; | |
786 | int first_b; | |
787 | boolean_t do_not_ignore_null; | |
788 | ||
789 | if (u8s == NULL || utf8len == NULL) | |
790 | return (EILSEQ); | |
791 | ||
792 | if (u32s == NULL || utf32len == NULL) | |
793 | return (E2BIG); | |
794 | ||
795 | if (check_endian(flag, &inendian, &outendian) != 0) | |
796 | return (EBADF); | |
797 | ||
798 | u32l = u8l = 0; | |
799 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
800 | ||
801 | outendian &= UCONV_OUT_NAT_ENDIAN; | |
802 | ||
803 | if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) | |
804 | u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : | |
805 | UCONV_BOM_SWAPPED_32; | |
806 | ||
807 | for (; u8l < *utf8len; ) { | |
808 | if (u8s[u8l] == 0 && do_not_ignore_null) | |
809 | break; | |
810 | ||
811 | hi = (uint32_t)u8s[u8l++]; | |
812 | ||
813 | if (hi > UCONV_ASCII_MAX) { | |
814 | if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) | |
815 | return (EILSEQ); | |
816 | ||
817 | first_b = hi; | |
818 | hi = hi & u8_masks_tbl[remaining_bytes]; | |
819 | ||
820 | for (; remaining_bytes > 0; remaining_bytes--) { | |
821 | if (u8l >= *utf8len) | |
822 | return (EINVAL); | |
823 | ||
824 | c = (uint32_t)u8s[u8l++]; | |
825 | ||
826 | if (first_b) { | |
827 | if (c < valid_min_2nd_byte[first_b] || | |
828 | c > valid_max_2nd_byte[first_b]) | |
829 | return (EILSEQ); | |
830 | first_b = 0; | |
831 | } else if (c < UCONV_U8_BYTE_MIN || | |
832 | c > UCONV_U8_BYTE_MAX) { | |
833 | return (EILSEQ); | |
834 | } | |
835 | hi = (hi << UCONV_U8_BIT_SHIFT) | | |
836 | (c & UCONV_U8_BIT_MASK); | |
837 | } | |
838 | } | |
839 | ||
840 | if (u32l >= *utf32len) | |
841 | return (E2BIG); | |
842 | ||
843 | u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi); | |
844 | } | |
845 | ||
846 | *utf32len = u32l; | |
847 | *utf8len = u8l; | |
848 | ||
849 | return (0); | |
850 | } | |
c28b2279 | 851 | |
93ce2b4c | 852 | #if defined(_KERNEL) |
c28b2279 BB |
853 | EXPORT_SYMBOL(uconv_u16tou32); |
854 | EXPORT_SYMBOL(uconv_u16tou8); | |
855 | EXPORT_SYMBOL(uconv_u32tou16); | |
856 | EXPORT_SYMBOL(uconv_u32tou8); | |
857 | EXPORT_SYMBOL(uconv_u8tou16); | |
858 | EXPORT_SYMBOL(uconv_u8tou32); | |
859 | #endif |