]>
Commit | Line | Data |
---|---|---|
42bcb36c BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. | |
23 | * Use is subject to license terms. | |
24 | */ | |
25 | ||
26 | #pragma ident "%Z%%M% %I% %E% SMI" | |
27 | ||
28 | /* | |
29 | * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. | |
30 | * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) | |
31 | * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), | |
32 | * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also | |
33 | * the section 3C man pages. | |
34 | * Interface stability: Committed | |
35 | */ | |
36 | ||
37 | #include <sys/types.h> | |
38 | #ifdef _KERNEL | |
39 | #include <sys/param.h> | |
40 | #include <sys/sysmacros.h> | |
41 | #include <sys/systm.h> | |
42 | #include <sys/debug.h> | |
43 | #include <sys/kmem.h> | |
44 | #include <sys/sunddi.h> | |
45 | #else | |
46 | #include <sys/u8_textprep.h> | |
47 | #endif /* _KERNEL */ | |
48 | #include <sys/byteorder.h> | |
49 | #include <sys/errno.h> | |
50 | ||
51 | ||
/*
 * The max and min values of high and low surrogate pairs of UTF-16,
 * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
 *
 * Note that UCONV_U16_BIT_SHIFT is not a shift count. It is the value
 * 0x400 (2^10) that the conversion functions in this file multiply by when
 * combining a surrogate pair and divide by (or take the remainder of) when
 * splitting a character into a surrogate pair.
 */
#define UCONV_U16_HI_MIN (0xd800U)
#define UCONV_U16_HI_MAX (0xdbffU)
#define UCONV_U16_LO_MIN (0xdc00U)
#define UCONV_U16_LO_MAX (0xdfffU)
#define UCONV_U16_BIT_SHIFT (0x0400U)
#define UCONV_U16_BIT_MASK (0x0fffffU)
#define UCONV_U16_START (0x010000U)

/* The maximum value of Unicode coding space and ASCII coding space. */
#define UCONV_UNICODE_MAX (0x10ffffU)
#define UCONV_ASCII_MAX (0x7fU)

/*
 * The mask values for input and output endians. Each mask is the union of
 * the big and the little endian flag for one direction; check_endian() uses
 * them both to extract the caller's selection and to detect the illegal
 * case where both flags of a direction are set.
 */
#define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros. "NAT" is the byte order of the
 * machine this file is compiled for (selected by _BIG_ENDIAN) and "REV" is
 * the opposite one.
 */
#ifdef _BIG_ENDIAN
#define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN
#define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN
#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN
#define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN
#else
#define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN
#define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN
#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN
#define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN
#endif /* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character in normal and reversed byte orderings.
 * UCONV_BOM_SWAPPED is the 16-bit BOM with its two bytes exchanged and
 * UCONV_BOM_SWAPPED_32 is the 32-bit BOM (0x0000feff) with its four bytes
 * reversed.
 */
#define UCONV_BOM_NORMAL (0xfeffU)
#define UCONV_BOM_SWAPPED (0xfffeU)
#define UCONV_BOM_SWAPPED_32 (0xfffe0000U)

/*
 * UTF-32 boundaries based on UTF-8 character byte lengths. Each value is
 * the largest character value that still fits in a UTF-8 sequence of the
 * named length.
 */
#define UCONV_U8_ONE_BYTE (0x7fU)
#define UCONV_U8_TWO_BYTES (0x7ffU)
#define UCONV_U8_THREE_BYTES (0xffffU)
#define UCONV_U8_FOUR_BYTES (0x10ffffU)

/* The common minimum and maximum values at the UTF-8 character bytes. */
#define UCONV_U8_BYTE_MIN (0x80U)
#define UCONV_U8_BYTE_MAX (0xbfU)

/*
 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 * UTF-8 character bytes.
 */
#define UCONV_U8_BIT_SHIFT 6
#define UCONV_U8_BIT_MASK 0x3f
106 | ||
/*
 * The following vector shows remaining bytes in a UTF-8 character.
 * Index will be the first byte of the character.
 *
 * The UTF-8 decoders in this file only consult the table for bytes above
 * 0x7f, and they return EILSEQ when the entry is zero. The zero entries at
 * 0x80 - 0xBF, 0xC0, 0xC1, and 0xF5 - 0xFF therefore mark bytes that are
 * rejected as the first byte of a character.
 */
static const uchar_t remaining_bytes_tbl[0x100] = {
	/* 00 - BF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
	0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
	3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
137 | ||
/*
 * The following is a vector of bit-masks that select the bits of the first
 * byte of a UTF-8 character which carry character data. The index is the
 * number of remaining bytes of the character, i.e., the value found in
 * remaining_bytes_tbl for that first byte.
 *
 * The decoders in this file only index it with 1, 2, or 3, since those are
 * the only non-zero values in remaining_bytes_tbl.
 *
 * In the kernel build the table has external linkage; elsewhere it is
 * private to this file.
 * NOTE(review): presumably other kernel code references it; confirm before
 * changing the linkage.
 */
#ifdef _KERNEL
const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
#else
static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
#endif /* _KERNEL */
148 | ||
/*
 * The following two vectors are to provide valid minimum and
 * maximum values for the 2'nd byte of a multibyte UTF-8 character for
 * better illegal sequence checking. The index value must be the value of
 * the first byte of the UTF-8 character.
 *
 * In this minimum table, most first bytes allow the common lower bound of
 * 0x80. The two exceptions are E0 (0xa0) and F0 (0x90): a smaller second
 * byte there would encode a value that fits in a shorter sequence.
 */
static const uchar_t valid_min_2nd_byte[0x100] = {
	/* 00 - BF: never the first byte of a multibyte character */
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 C1 C2 C3 C4 C5 C6 C7 */
	0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* C8 C9 CA CB CC CD CE CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* D0 D1 D2 D3 D4 D5 D6 D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* D8 D9 DA DB DC DD DE DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* E0 E1 E2 E3 E4 E5 E6 E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* E8 E9 EA EB EC ED EE EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* F0 F1 F2 F3 F4 F5 F6 F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,

	/* F8 - FF */
	0, 0, 0, 0, 0, 0, 0, 0
};
204 | ||
/*
 * Valid maximum value for the 2'nd byte of a multibyte UTF-8 character,
 * indexed by the value of the first byte of the character; the counterpart
 * of valid_min_2nd_byte.
 *
 * Most first bytes allow the common upper bound of 0xbf. The two
 * exceptions are ED (0x9f), where a larger second byte would encode a
 * value in the UTF-16 surrogate range U+D800 - U+DFFF, and F4 (0x8f),
 * where a larger second byte would encode a value above U+10FFFF.
 */
static const uchar_t valid_max_2nd_byte[0x100] = {
	/* 00 - BF: never the first byte of a multibyte character */
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 C1 C2 C3 C4 C5 C6 C7 */
	0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* C8 C9 CA CB CC CD CE CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* D0 D1 D2 D3 D4 D5 D6 D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* D8 D9 DA DB DC DD DE DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* E0 E1 E2 E3 E4 E5 E6 E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* E8 E9 EA EB EC ED EE EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

	/* F0 F1 F2 F3 F4 F5 F6 F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,

	/* F8 - FF */
	0, 0, 0, 0, 0, 0, 0, 0
};
254 | ||
255 | ||
256 | static int | |
257 | check_endian(int flag, int *in, int *out) | |
258 | { | |
259 | *in = flag & UCONV_IN_ENDIAN_MASKS; | |
260 | ||
261 | /* You cannot have both. */ | |
262 | if (*in == UCONV_IN_ENDIAN_MASKS) | |
263 | return (EBADF); | |
264 | ||
265 | if (*in == 0) | |
266 | *in = UCONV_IN_NAT_ENDIAN; | |
267 | ||
268 | *out = flag & UCONV_OUT_ENDIAN_MASKS; | |
269 | ||
270 | /* You cannot have both. */ | |
271 | if (*out == UCONV_OUT_ENDIAN_MASKS) | |
272 | return (EBADF); | |
273 | ||
274 | if (*out == 0) | |
275 | *out = UCONV_OUT_NAT_ENDIAN; | |
276 | ||
277 | return (0); | |
278 | } | |
279 | ||
280 | static boolean_t | |
281 | check_bom16(const uint16_t *u16s, size_t u16l, int *in) | |
282 | { | |
283 | if (u16l > 0) { | |
284 | if (*u16s == UCONV_BOM_NORMAL) { | |
285 | *in = UCONV_IN_NAT_ENDIAN; | |
286 | return (B_TRUE); | |
287 | } | |
288 | if (*u16s == UCONV_BOM_SWAPPED) { | |
289 | *in = UCONV_IN_REV_ENDIAN; | |
290 | return (B_TRUE); | |
291 | } | |
292 | } | |
293 | ||
294 | return (B_FALSE); | |
295 | } | |
296 | ||
297 | static boolean_t | |
298 | check_bom32(const uint32_t *u32s, size_t u32l, int *in) | |
299 | { | |
300 | if (u32l > 0) { | |
301 | if (*u32s == UCONV_BOM_NORMAL) { | |
302 | *in = UCONV_IN_NAT_ENDIAN; | |
303 | return (B_TRUE); | |
304 | } | |
305 | if (*u32s == UCONV_BOM_SWAPPED_32) { | |
306 | *in = UCONV_IN_REV_ENDIAN; | |
307 | return (B_TRUE); | |
308 | } | |
309 | } | |
310 | ||
311 | return (B_FALSE); | |
312 | } | |
313 | ||
314 | int | |
315 | uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, | |
316 | uint32_t *u32s, size_t *utf32len, int flag) | |
317 | { | |
318 | int inendian; | |
319 | int outendian; | |
320 | size_t u16l; | |
321 | size_t u32l; | |
322 | uint32_t hi; | |
323 | uint32_t lo; | |
324 | boolean_t do_not_ignore_null; | |
325 | ||
326 | /* | |
327 | * Do preliminary validity checks on parameters and collect info on | |
328 | * endians. | |
329 | */ | |
330 | if (u16s == NULL || utf16len == NULL) | |
331 | return (EILSEQ); | |
332 | ||
333 | if (u32s == NULL || utf32len == NULL) | |
334 | return (E2BIG); | |
335 | ||
336 | if (check_endian(flag, &inendian, &outendian) != 0) | |
337 | return (EBADF); | |
338 | ||
339 | /* | |
340 | * Initialize input and output parameter buffer indices and | |
341 | * temporary variables. | |
342 | */ | |
343 | u16l = u32l = 0; | |
344 | hi = 0; | |
345 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
346 | ||
347 | /* | |
348 | * Check on the BOM at the beginning of the input buffer if required | |
349 | * and if there is indeed one, process it. | |
350 | */ | |
351 | if ((flag & UCONV_IN_ACCEPT_BOM) && | |
352 | check_bom16(u16s, *utf16len, &inendian)) | |
353 | u16l++; | |
354 | ||
355 | /* | |
356 | * Reset inendian and outendian so that after this point, those can be | |
357 | * used as condition values. | |
358 | */ | |
359 | inendian &= UCONV_IN_NAT_ENDIAN; | |
360 | outendian &= UCONV_OUT_NAT_ENDIAN; | |
361 | ||
362 | /* | |
363 | * If there is something in the input buffer and if necessary and | |
364 | * requested, save the BOM at the output buffer. | |
365 | */ | |
366 | if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) | |
367 | u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : | |
368 | UCONV_BOM_SWAPPED_32; | |
369 | ||
370 | /* | |
371 | * Do conversion; if encounter a surrogate pair, assemble high and | |
372 | * low pair values to form a UTF-32 character. If a half of a pair | |
373 | * exists alone, then, either it is an illegal (EILSEQ) or | |
374 | * invalid (EINVAL) value. | |
375 | */ | |
376 | for (; u16l < *utf16len; u16l++) { | |
377 | if (u16s[u16l] == 0 && do_not_ignore_null) | |
378 | break; | |
379 | ||
380 | lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); | |
381 | ||
382 | if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { | |
383 | if (hi) | |
384 | return (EILSEQ); | |
385 | hi = lo; | |
386 | continue; | |
387 | } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { | |
388 | if (! hi) | |
389 | return (EILSEQ); | |
390 | lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + | |
391 | lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) | |
392 | + UCONV_U16_START; | |
393 | hi = 0; | |
394 | } else if (hi) { | |
395 | return (EILSEQ); | |
396 | } | |
397 | ||
398 | if (u32l >= *utf32len) | |
399 | return (E2BIG); | |
400 | ||
401 | u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); | |
402 | } | |
403 | ||
404 | /* | |
405 | * If high half didn't see low half, then, it's most likely the input | |
406 | * parameter is incomplete. | |
407 | */ | |
408 | if (hi) | |
409 | return (EINVAL); | |
410 | ||
411 | /* | |
412 | * Save the number of consumed and saved characters. They do not | |
413 | * include terminating NULL character (U+0000) at the end of | |
414 | * the input buffer (even when UCONV_IGNORE_NULL isn't specified and | |
415 | * the input buffer length is big enough to include the terminating | |
416 | * NULL character). | |
417 | */ | |
418 | *utf16len = u16l; | |
419 | *utf32len = u32l; | |
420 | ||
421 | return (0); | |
422 | } | |
423 | ||
424 | int | |
425 | uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, | |
426 | uchar_t *u8s, size_t *utf8len, int flag) | |
427 | { | |
428 | int inendian; | |
429 | int outendian; | |
430 | size_t u16l; | |
431 | size_t u8l; | |
432 | uint32_t hi; | |
433 | uint32_t lo; | |
434 | boolean_t do_not_ignore_null; | |
435 | ||
436 | if (u16s == NULL || utf16len == NULL) | |
437 | return (EILSEQ); | |
438 | ||
439 | if (u8s == NULL || utf8len == NULL) | |
440 | return (E2BIG); | |
441 | ||
442 | if (check_endian(flag, &inendian, &outendian) != 0) | |
443 | return (EBADF); | |
444 | ||
445 | u16l = u8l = 0; | |
446 | hi = 0; | |
447 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
448 | ||
449 | if ((flag & UCONV_IN_ACCEPT_BOM) && | |
450 | check_bom16(u16s, *utf16len, &inendian)) | |
451 | u16l++; | |
452 | ||
453 | inendian &= UCONV_IN_NAT_ENDIAN; | |
454 | ||
455 | for (; u16l < *utf16len; u16l++) { | |
456 | if (u16s[u16l] == 0 && do_not_ignore_null) | |
457 | break; | |
458 | ||
459 | lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); | |
460 | ||
461 | if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { | |
462 | if (hi) | |
463 | return (EILSEQ); | |
464 | hi = lo; | |
465 | continue; | |
466 | } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { | |
467 | if (! hi) | |
468 | return (EILSEQ); | |
469 | lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + | |
470 | lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) | |
471 | + UCONV_U16_START; | |
472 | hi = 0; | |
473 | } else if (hi) { | |
474 | return (EILSEQ); | |
475 | } | |
476 | ||
477 | /* | |
478 | * Now we convert a UTF-32 character into a UTF-8 character. | |
479 | * Unicode coding space is between U+0000 and U+10FFFF; | |
480 | * anything bigger is an illegal character. | |
481 | */ | |
482 | if (lo <= UCONV_U8_ONE_BYTE) { | |
483 | if (u8l >= *utf8len) | |
484 | return (E2BIG); | |
485 | u8s[u8l++] = (uchar_t)lo; | |
486 | } else if (lo <= UCONV_U8_TWO_BYTES) { | |
487 | if ((u8l + 1) >= *utf8len) | |
488 | return (E2BIG); | |
489 | u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); | |
490 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); | |
491 | } else if (lo <= UCONV_U8_THREE_BYTES) { | |
492 | if ((u8l + 2) >= *utf8len) | |
493 | return (E2BIG); | |
494 | u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); | |
495 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); | |
496 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); | |
497 | } else if (lo <= UCONV_U8_FOUR_BYTES) { | |
498 | if ((u8l + 3) >= *utf8len) | |
499 | return (E2BIG); | |
500 | u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); | |
501 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); | |
502 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); | |
503 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); | |
504 | } else { | |
505 | return (EILSEQ); | |
506 | } | |
507 | } | |
508 | ||
509 | if (hi) | |
510 | return (EINVAL); | |
511 | ||
512 | *utf16len = u16l; | |
513 | *utf8len = u8l; | |
514 | ||
515 | return (0); | |
516 | } | |
517 | ||
518 | int | |
519 | uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, | |
520 | uint16_t *u16s, size_t *utf16len, int flag) | |
521 | { | |
522 | int inendian; | |
523 | int outendian; | |
524 | size_t u16l; | |
525 | size_t u32l; | |
526 | uint32_t hi; | |
527 | uint32_t lo; | |
528 | boolean_t do_not_ignore_null; | |
529 | ||
530 | if (u32s == NULL || utf32len == NULL) | |
531 | return (EILSEQ); | |
532 | ||
533 | if (u16s == NULL || utf16len == NULL) | |
534 | return (E2BIG); | |
535 | ||
536 | if (check_endian(flag, &inendian, &outendian) != 0) | |
537 | return (EBADF); | |
538 | ||
539 | u16l = u32l = 0; | |
540 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
541 | ||
542 | if ((flag & UCONV_IN_ACCEPT_BOM) && | |
543 | check_bom32(u32s, *utf32len, &inendian)) | |
544 | u32l++; | |
545 | ||
546 | inendian &= UCONV_IN_NAT_ENDIAN; | |
547 | outendian &= UCONV_OUT_NAT_ENDIAN; | |
548 | ||
549 | if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) | |
550 | u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : | |
551 | UCONV_BOM_SWAPPED; | |
552 | ||
553 | for (; u32l < *utf32len; u32l++) { | |
554 | if (u32s[u32l] == 0 && do_not_ignore_null) | |
555 | break; | |
556 | ||
557 | hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); | |
558 | ||
559 | /* | |
560 | * Anything bigger than the Unicode coding space, i.e., | |
561 | * Unicode scalar value bigger than U+10FFFF, is an illegal | |
562 | * character. | |
563 | */ | |
564 | if (hi > UCONV_UNICODE_MAX) | |
565 | return (EILSEQ); | |
566 | ||
567 | /* | |
568 | * Anything bigger than U+FFFF must be converted into | |
569 | * a surrogate pair in UTF-16. | |
570 | */ | |
571 | if (hi >= UCONV_U16_START) { | |
572 | lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + | |
573 | UCONV_U16_LO_MIN; | |
574 | hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + | |
575 | UCONV_U16_HI_MIN; | |
576 | ||
577 | if ((u16l + 1) >= *utf16len) | |
578 | return (E2BIG); | |
579 | ||
580 | if (outendian) { | |
581 | u16s[u16l++] = (uint16_t)hi; | |
582 | u16s[u16l++] = (uint16_t)lo; | |
583 | } else { | |
584 | u16s[u16l++] = BSWAP_16(((uint16_t)hi)); | |
585 | u16s[u16l++] = BSWAP_16(((uint16_t)lo)); | |
586 | } | |
587 | } else { | |
588 | if (u16l >= *utf16len) | |
589 | return (E2BIG); | |
590 | u16s[u16l++] = (outendian) ? (uint16_t)hi : | |
591 | BSWAP_16(((uint16_t)hi)); | |
592 | } | |
593 | } | |
594 | ||
595 | *utf16len = u16l; | |
596 | *utf32len = u32l; | |
597 | ||
598 | return (0); | |
599 | } | |
600 | ||
601 | int | |
602 | uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, | |
603 | uchar_t *u8s, size_t *utf8len, int flag) | |
604 | { | |
605 | int inendian; | |
606 | int outendian; | |
607 | size_t u32l; | |
608 | size_t u8l; | |
609 | uint32_t lo; | |
610 | boolean_t do_not_ignore_null; | |
611 | ||
612 | if (u32s == NULL || utf32len == NULL) | |
613 | return (EILSEQ); | |
614 | ||
615 | if (u8s == NULL || utf8len == NULL) | |
616 | return (E2BIG); | |
617 | ||
618 | if (check_endian(flag, &inendian, &outendian) != 0) | |
619 | return (EBADF); | |
620 | ||
621 | u32l = u8l = 0; | |
622 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
623 | ||
624 | if ((flag & UCONV_IN_ACCEPT_BOM) && | |
625 | check_bom32(u32s, *utf32len, &inendian)) | |
626 | u32l++; | |
627 | ||
628 | inendian &= UCONV_IN_NAT_ENDIAN; | |
629 | ||
630 | for (; u32l < *utf32len; u32l++) { | |
631 | if (u32s[u32l] == 0 && do_not_ignore_null) | |
632 | break; | |
633 | ||
634 | lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); | |
635 | ||
636 | if (lo <= UCONV_U8_ONE_BYTE) { | |
637 | if (u8l >= *utf8len) | |
638 | return (E2BIG); | |
639 | u8s[u8l++] = (uchar_t)lo; | |
640 | } else if (lo <= UCONV_U8_TWO_BYTES) { | |
641 | if ((u8l + 1) >= *utf8len) | |
642 | return (E2BIG); | |
643 | u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); | |
644 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); | |
645 | } else if (lo <= UCONV_U8_THREE_BYTES) { | |
646 | if ((u8l + 2) >= *utf8len) | |
647 | return (E2BIG); | |
648 | u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); | |
649 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); | |
650 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); | |
651 | } else if (lo <= UCONV_U8_FOUR_BYTES) { | |
652 | if ((u8l + 3) >= *utf8len) | |
653 | return (E2BIG); | |
654 | u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); | |
655 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); | |
656 | u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); | |
657 | u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); | |
658 | } else { | |
659 | return (EILSEQ); | |
660 | } | |
661 | } | |
662 | ||
663 | *utf32len = u32l; | |
664 | *utf8len = u8l; | |
665 | ||
666 | return (0); | |
667 | } | |
668 | ||
669 | int | |
670 | uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, | |
671 | uint16_t *u16s, size_t *utf16len, int flag) | |
672 | { | |
673 | int inendian; | |
674 | int outendian; | |
675 | size_t u16l; | |
676 | size_t u8l; | |
677 | uint32_t hi; | |
678 | uint32_t lo; | |
679 | int remaining_bytes; | |
680 | int first_b; | |
681 | boolean_t do_not_ignore_null; | |
682 | ||
683 | if (u8s == NULL || utf8len == NULL) | |
684 | return (EILSEQ); | |
685 | ||
686 | if (u16s == NULL || utf16len == NULL) | |
687 | return (E2BIG); | |
688 | ||
689 | if (check_endian(flag, &inendian, &outendian) != 0) | |
690 | return (EBADF); | |
691 | ||
692 | u16l = u8l = 0; | |
693 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
694 | ||
695 | outendian &= UCONV_OUT_NAT_ENDIAN; | |
696 | ||
697 | if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) | |
698 | u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : | |
699 | UCONV_BOM_SWAPPED; | |
700 | ||
701 | for (; u8l < *utf8len; ) { | |
702 | if (u8s[u8l] == 0 && do_not_ignore_null) | |
703 | break; | |
704 | ||
705 | /* | |
706 | * Collect a UTF-8 character and convert it to a UTF-32 | |
707 | * character. In doing so, we screen out illegally formed | |
708 | * UTF-8 characters and treat such as illegal characters. | |
709 | * The algorithm at below also screens out anything bigger | |
710 | * than the U+10FFFF. | |
711 | * | |
712 | * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for | |
713 | * more details on the illegal values of UTF-8 character | |
714 | * bytes. | |
715 | */ | |
716 | hi = (uint32_t)u8s[u8l++]; | |
717 | ||
718 | if (hi > UCONV_ASCII_MAX) { | |
719 | if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) | |
720 | return (EILSEQ); | |
721 | ||
722 | first_b = hi; | |
723 | hi = hi & u8_masks_tbl[remaining_bytes]; | |
724 | ||
725 | for (; remaining_bytes > 0; remaining_bytes--) { | |
726 | /* | |
727 | * If we have no more bytes, the current | |
728 | * UTF-8 character is incomplete. | |
729 | */ | |
730 | if (u8l >= *utf8len) | |
731 | return (EINVAL); | |
732 | ||
733 | lo = (uint32_t)u8s[u8l++]; | |
734 | ||
735 | if (first_b) { | |
736 | if (lo < valid_min_2nd_byte[first_b] || | |
737 | lo > valid_max_2nd_byte[first_b]) | |
738 | return (EILSEQ); | |
739 | first_b = 0; | |
740 | } else if (lo < UCONV_U8_BYTE_MIN || | |
741 | lo > UCONV_U8_BYTE_MAX) { | |
742 | return (EILSEQ); | |
743 | } | |
744 | hi = (hi << UCONV_U8_BIT_SHIFT) | | |
745 | (lo & UCONV_U8_BIT_MASK); | |
746 | } | |
747 | } | |
748 | ||
749 | if (hi >= UCONV_U16_START) { | |
750 | lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + | |
751 | UCONV_U16_LO_MIN; | |
752 | hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + | |
753 | UCONV_U16_HI_MIN; | |
754 | ||
755 | if ((u16l + 1) >= *utf16len) | |
756 | return (E2BIG); | |
757 | ||
758 | if (outendian) { | |
759 | u16s[u16l++] = (uint16_t)hi; | |
760 | u16s[u16l++] = (uint16_t)lo; | |
761 | } else { | |
762 | u16s[u16l++] = BSWAP_16(((uint16_t)hi)); | |
763 | u16s[u16l++] = BSWAP_16(((uint16_t)lo)); | |
764 | } | |
765 | } else { | |
766 | if (u16l >= *utf16len) | |
767 | return (E2BIG); | |
768 | ||
769 | u16s[u16l++] = (outendian) ? (uint16_t)hi : | |
770 | BSWAP_16(((uint16_t)hi)); | |
771 | } | |
772 | } | |
773 | ||
774 | *utf16len = u16l; | |
775 | *utf8len = u8l; | |
776 | ||
777 | return (0); | |
778 | } | |
779 | ||
780 | int | |
781 | uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, | |
782 | uint32_t *u32s, size_t *utf32len, int flag) | |
783 | { | |
784 | int inendian; | |
785 | int outendian; | |
786 | size_t u32l; | |
787 | size_t u8l; | |
788 | uint32_t hi; | |
789 | uint32_t c; | |
790 | int remaining_bytes; | |
791 | int first_b; | |
792 | boolean_t do_not_ignore_null; | |
793 | ||
794 | if (u8s == NULL || utf8len == NULL) | |
795 | return (EILSEQ); | |
796 | ||
797 | if (u32s == NULL || utf32len == NULL) | |
798 | return (E2BIG); | |
799 | ||
800 | if (check_endian(flag, &inendian, &outendian) != 0) | |
801 | return (EBADF); | |
802 | ||
803 | u32l = u8l = 0; | |
804 | do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); | |
805 | ||
806 | outendian &= UCONV_OUT_NAT_ENDIAN; | |
807 | ||
808 | if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) | |
809 | u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : | |
810 | UCONV_BOM_SWAPPED_32; | |
811 | ||
812 | for (; u8l < *utf8len; ) { | |
813 | if (u8s[u8l] == 0 && do_not_ignore_null) | |
814 | break; | |
815 | ||
816 | hi = (uint32_t)u8s[u8l++]; | |
817 | ||
818 | if (hi > UCONV_ASCII_MAX) { | |
819 | if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) | |
820 | return (EILSEQ); | |
821 | ||
822 | first_b = hi; | |
823 | hi = hi & u8_masks_tbl[remaining_bytes]; | |
824 | ||
825 | for (; remaining_bytes > 0; remaining_bytes--) { | |
826 | if (u8l >= *utf8len) | |
827 | return (EINVAL); | |
828 | ||
829 | c = (uint32_t)u8s[u8l++]; | |
830 | ||
831 | if (first_b) { | |
832 | if (c < valid_min_2nd_byte[first_b] || | |
833 | c > valid_max_2nd_byte[first_b]) | |
834 | return (EILSEQ); | |
835 | first_b = 0; | |
836 | } else if (c < UCONV_U8_BYTE_MIN || | |
837 | c > UCONV_U8_BYTE_MAX) { | |
838 | return (EILSEQ); | |
839 | } | |
840 | hi = (hi << UCONV_U8_BIT_SHIFT) | | |
841 | (c & UCONV_U8_BIT_MASK); | |
842 | } | |
843 | } | |
844 | ||
845 | if (u32l >= *utf32len) | |
846 | return (E2BIG); | |
847 | ||
848 | u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi); | |
849 | } | |
850 | ||
851 | *utf32len = u32l; | |
852 | *utf8len = u8l; | |
853 | ||
854 | return (0); | |
855 | } |