4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
29 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33 * the section 3C man pages.
34 * Interface stability: Committed
37 #include <sys/types.h>
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/debug.h>
43 #include <sys/sunddi.h>
45 #include <sys/u8_textprep.h>
47 #include <sys/byteorder.h>
48 #include <sys/errno.h>
52 * The max and min values of high and low surrogate pairs of UTF-16,
53 * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
/*
 * File-local constants used by the UTF-8/UTF-16/UTF-32 converters below.
 * UCONV_IN_BIG_ENDIAN, UCONV_IN_LITTLE_ENDIAN, UCONV_OUT_BIG_ENDIAN and
 * UCONV_OUT_LITTLE_ENDIAN are not defined in this file; presumably they come
 * from one of the included headers (TODO confirm which).
 */
55 #define UCONV_U16_HI_MIN (0xd800U)
56 #define UCONV_U16_HI_MAX (0xdbffU)
57 #define UCONV_U16_LO_MIN (0xdc00U)
58 #define UCONV_U16_LO_MAX (0xdfffU)
/*
 * NOTE: despite its name, UCONV_U16_BIT_SHIFT (0x400 == 1 << 10) is used by
 * the converters as a multiplier, divisor and modulus, not as a shift count.
 */
59 #define UCONV_U16_BIT_SHIFT (0x0400U)
60 #define UCONV_U16_BIT_MASK (0x0fffffU)
61 #define UCONV_U16_START (0x010000U)
63 /* The maximum value of Unicode coding space and ASCII coding space. */
64 #define UCONV_UNICODE_MAX (0x10ffffU)
65 #define UCONV_ASCII_MAX (0x7fU)
67 /* The mask values for input and output endians. */
68 #define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
69 #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
71 /* Native and reversed endian macros. */
/*
 * NOTE(review): both the big-endian-host mapping and the little-endian-host
 * mapping appear below. Only the closing #endif is visible in this extract;
 * the #if/#else lines that select one of the two groups are missing.
 */
73 #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN
74 #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN
75 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN
76 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN
78 #define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN
79 #define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN
80 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN
81 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN
82 #endif /* _BIG_ENDIAN */
84 /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
/*
 * UCONV_BOM_SWAPPED is the 16-bit byte swap of UCONV_BOM_NORMAL, and
 * UCONV_BOM_SWAPPED_32 is the 32-bit byte swap of 0x0000feff.
 */
85 #define UCONV_BOM_NORMAL (0xfeffU)
86 #define UCONV_BOM_SWAPPED (0xfffeU)
87 #define UCONV_BOM_SWAPPED_32 (0xfffe0000U)
89 /* UTF-32 boundaries based on UTF-8 character byte lengths. */
/* Each value is the largest scalar value encodable in that many UTF-8 bytes. */
90 #define UCONV_U8_ONE_BYTE (0x7fU)
91 #define UCONV_U8_TWO_BYTES (0x7ffU)
92 #define UCONV_U8_THREE_BYTES (0xffffU)
93 #define UCONV_U8_FOUR_BYTES (0x10ffffU)
95 /* The common minimum and maximum values at the UTF-8 character bytes. */
/* These bound the continuation bytes, i.e. the bit pattern 10xx xxxx. */
96 #define UCONV_U8_BYTE_MIN (0x80U)
97 #define UCONV_U8_BYTE_MAX (0xbfU)
100 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
101 * UTF-8 character bytes.
103 #define UCONV_U8_BIT_SHIFT 6
104 #define UCONV_U8_BIT_MASK 0x3f
107 * The following vector shows remaining bytes in a UTF-8 character.
108 * Index will be the first byte of the character.
/*
 * remaining_bytes_tbl[b] is the number of bytes that follow b when b is the
 * first byte of a UTF-8 character: 1 for 0xC2-0xDF, 2 for 0xE0-0xEF,
 * 3 for 0xF0-0xF4, and 0 for every other value (0x00-0xC1 and 0xF5-0xFF).
 * The UTF-8 decoders in this file look the first byte up here only when it is
 * above UCONV_ASCII_MAX and test the result against 0.
 * NOTE(review): the closing of the initializer and some row-label comments
 * are missing from this extract.
 */
110 static const uchar_t remaining_bytes_tbl
[0x100] = {
/* Rows below cover first-byte values 0x00-0xBF, sixteen entries per row. */
111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
124 /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
125 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127 /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
128 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
130 /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
131 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
133 /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
134 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
138 * The following is a vector of bit-masks that select the used bits in
139 * the first byte of a UTF-8 character. The index is the number of remaining bytes of the character, as given by remaining_bytes_tbl.
/*
 * u8_masks_tbl[n] is ANDed with the first byte of a UTF-8 character to keep
 * only its payload bits, where n is the value read from remaining_bytes_tbl
 * for that byte. That table only yields 1, 2 or 3 for multibyte first bytes,
 * so entries 4 and 5 are not reached through it.
 * NOTE(review): two definitions with identical contents appear here, one
 * with external linkage and one static. Presumably exactly one is selected
 * by conditional-compilation lines missing from this extract (TODO confirm).
 */
143 const uchar_t u8_masks_tbl
[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145 static const uchar_t u8_masks_tbl
[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
149 * The following two vectors provide the valid minimum and
150 * maximum values for the second byte of a multibyte UTF-8 character, for
151 * stricter illegal-sequence checking. The index value must be the value of
152 * the first byte of the UTF-8 character.
/*
 * valid_min_2nd_byte[b] is the smallest value accepted for the byte that
 * follows first byte b. It is 0x80 for every multibyte first byte except
 * 0xE0 (0xa0) and 0xF0 (0x90); those two raised minimums exclude encodings
 * of values that would fit in a shorter UTF-8 form. Entries for bytes that
 * are not multibyte first bytes (0x00-0xC1, 0xF5-0xFF) are 0.
 * NOTE(review): the closing of the initializer is missing from this extract.
 */
154 static const uchar_t valid_min_2nd_byte
[0x100] = {
/* Rows below cover first-byte values 0x00-0xBF, eight entries per row. */
155 0, 0, 0, 0, 0, 0, 0, 0,
156 0, 0, 0, 0, 0, 0, 0, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, 0, 0, 0, 0,
178 0, 0, 0, 0, 0, 0, 0, 0,
180 /* C0 C1 C2 C3 C4 C5 C6 C7 */
181 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
183 /* C8 C9 CA CB CC CD CE CF */
184 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
186 /* D0 D1 D2 D3 D4 D5 D6 D7 */
187 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
189 /* D8 D9 DA DB DC DD DE DF */
190 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
192 /* E0 E1 E2 E3 E4 E5 E6 E7 */
193 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
195 /* E8 E9 EA EB EC ED EE EF */
196 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
198 /* F0 F1 F2 F3 F4 F5 F6 F7 */
199 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
/* The final row covers first-byte values 0xF8-0xFF. */
201 0, 0, 0, 0, 0, 0, 0, 0
/*
 * valid_max_2nd_byte[b] is the largest value accepted for the byte that
 * follows first byte b. It is 0xbf for every multibyte first byte except
 * 0xED (0x9f), which keeps the decoded value below the UTF-16 surrogate
 * range starting at 0xd800, and 0xF4 (0x8f), which keeps the decoded value
 * at or below 0x10ffff. Entries for bytes that are not multibyte first
 * bytes (0x00-0xC1, 0xF5-0xFF) are 0.
 * NOTE(review): the closing of the initializer is missing from this extract.
 */
204 static const uchar_t valid_max_2nd_byte
[0x100] = {
/* Rows below cover first-byte values 0x00-0xBF, eight entries per row. */
205 0, 0, 0, 0, 0, 0, 0, 0,
206 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0,
208 0, 0, 0, 0, 0, 0, 0, 0,
209 0, 0, 0, 0, 0, 0, 0, 0,
210 0, 0, 0, 0, 0, 0, 0, 0,
211 0, 0, 0, 0, 0, 0, 0, 0,
212 0, 0, 0, 0, 0, 0, 0, 0,
213 0, 0, 0, 0, 0, 0, 0, 0,
214 0, 0, 0, 0, 0, 0, 0, 0,
215 0, 0, 0, 0, 0, 0, 0, 0,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0,
218 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0,
228 0, 0, 0, 0, 0, 0, 0, 0,
230 /* C0 C1 C2 C3 C4 C5 C6 C7 */
231 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
233 /* C8 C9 CA CB CC CD CE CF */
234 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
236 /* D0 D1 D2 D3 D4 D5 D6 D7 */
237 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
239 /* D8 D9 DA DB DC DD DE DF */
240 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
242 /* E0 E1 E2 E3 E4 E5 E6 E7 */
243 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
245 /* E8 E9 EA EB EC ED EE EF */
246 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
248 /* F0 F1 F2 F3 F4 F5 F6 F7 */
249 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
/* The final row covers first-byte values 0xF8-0xFF. */
251 0, 0, 0, 0, 0, 0, 0, 0
/*
 * check_endian: split the endian-selection bits of a conversion flag word.
 *
 *   flag - caller-supplied conversion flags.
 *   in   - receives flag masked with UCONV_IN_ENDIAN_MASKS, or
 *          UCONV_IN_NAT_ENDIAN where the default is applied.
 *   out  - receives flag masked with UCONV_OUT_ENDIAN_MASKS, or
 *          UCONV_OUT_NAT_ENDIAN where the default is applied.
 *
 * Every caller visible in this file compares the result with 0 and treats a
 * nonzero result as failure.
 *
 * NOTE(review): lines are missing from this extract: the return type, the
 * statements executed when both endian bits of one direction are set, the
 * conditions guarding the two NAT_ENDIAN defaults (presumably "no endian
 * bit given"; TODO confirm) and the final return.
 */
256 check_endian(int flag
, int *in
, int *out
)
258 *in
= flag
& UCONV_IN_ENDIAN_MASKS
;
260 /* You cannot have both. */
261 if (*in
== UCONV_IN_ENDIAN_MASKS
)
265 *in
= UCONV_IN_NAT_ENDIAN
;
/* The output direction is handled the same way as the input direction. */
267 *out
= flag
& UCONV_OUT_ENDIAN_MASKS
;
269 /* You cannot have both. */
270 if (*out
== UCONV_OUT_ENDIAN_MASKS
)
274 *out
= UCONV_OUT_NAT_ENDIAN
;
/*
 * check_bom16: look at the first UTF-16 unit of the input for a BOM.
 *
 *   u16s - UTF-16 input; the first unit is compared with the BOM values.
 *   u16l - input length; its use is not visible in this extract
 *          (presumably guards the read of *u16s; TODO confirm).
 *   in   - set to UCONV_IN_NAT_ENDIAN when the first unit equals
 *          UCONV_BOM_NORMAL, or to UCONV_IN_REV_ENDIAN when it equals
 *          UCONV_BOM_SWAPPED.
 *
 * Callers in this file use the result directly as a truth value.
 * NOTE(review): the return type and all return statements are missing from
 * this extract.
 */
280 check_bom16(const uint16_t *u16s
, size_t u16l
, int *in
)
283 if (*u16s
== UCONV_BOM_NORMAL
) {
284 *in
= UCONV_IN_NAT_ENDIAN
;
/* A byte-swapped BOM means the input is in the reversed byte order. */
287 if (*u16s
== UCONV_BOM_SWAPPED
) {
288 *in
= UCONV_IN_REV_ENDIAN
;
/*
 * check_bom32: look at the first UTF-32 unit of the input for a BOM.
 *
 *   u32s - UTF-32 input; the first unit is compared with the BOM values.
 *   u32l - input length; its use is not visible in this extract
 *          (presumably guards the read of *u32s; TODO confirm).
 *   in   - set to UCONV_IN_NAT_ENDIAN when the first unit equals
 *          UCONV_BOM_NORMAL, or to UCONV_IN_REV_ENDIAN when it equals
 *          UCONV_BOM_SWAPPED_32.
 *
 * Callers in this file use the result directly as a truth value.
 * NOTE(review): the return type and all return statements are missing from
 * this extract.
 */
297 check_bom32(const uint32_t *u32s
, size_t u32l
, int *in
)
300 if (*u32s
== UCONV_BOM_NORMAL
) {
301 *in
= UCONV_IN_NAT_ENDIAN
;
/* A byte-swapped BOM means the input is in the reversed byte order. */
304 if (*u32s
== UCONV_BOM_SWAPPED_32
) {
305 *in
= UCONV_IN_REV_ENDIAN
;
/*
 * uconv_u16tou32: convert UTF-16 units to UTF-32 units.
 *
 *   u16s     - UTF-16 input buffer.
 *   utf16len - in: number of input units available; the trailing comment in
 *              this function says the consumed count is saved on completion.
 *   u32s     - UTF-32 output buffer.
 *   utf32len - in: capacity of u32s in units; the trailing comment says the
 *              saved count is stored on completion.
 *   flag     - conversion flags. Those read here are the endian bits (via
 *              check_endian), UCONV_IGNORE_NULL, UCONV_IN_ACCEPT_BOM and
 *              UCONV_OUT_EMIT_BOM.
 *
 * Visible behaviour: each input unit is brought to native order; a unit in
 * the high-surrogate range and a unit in the low-surrogate range are
 * combined into one value; the output index is checked against *utf32len
 * before each store; each value is stored in native order or byte-swapped
 * according to outendian.
 *
 * NOTE(review): this extract is missing lines, including the return type,
 * most local declarations, every return statement, the bodies of several
 * branches and the statements that write back the lengths. Error codes are
 * therefore not documented here.
 */
314 uconv_u16tou32(const uint16_t *u16s
, size_t *utf16len
,
315 uint32_t *u32s
, size_t *utf32len
, int flag
)
323 boolean_t do_not_ignore_null
;
326 * Do preliminary validity checks on parameters and collect info on
329 if (u16s
== NULL
|| utf16len
== NULL
)
332 if (u32s
== NULL
|| utf32len
== NULL
)
335 if (check_endian(flag
, &inendian
, &outendian
) != 0)
339 * Initialize input and output parameter buffer indices and
340 * temporary variables.
344 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
347 * Check on the BOM at the beginning of the input buffer if required
348 * and if there is indeed one, process it.
350 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
351 check_bom16(u16s
, *utf16len
, &inendian
))
355 * Reset inendian and outendian so that after this point, those can be
356 * used as condition values.
358 inendian
&= UCONV_IN_NAT_ENDIAN
;
359 outendian
&= UCONV_OUT_NAT_ENDIAN
;
362 * If there is something in the input buffer and if necessary and
363 * requested, save the BOM at the output buffer.
365 if (*utf16len
> 0 && *utf32len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
366 u32s
[u32l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
367 UCONV_BOM_SWAPPED_32
;
370 * Do conversion; if encounter a surrogate pair, assemble high and
371 * low pair values to form a UTF-32 character. If a half of a pair
372 * exists alone, then, either it is an illegal (EILSEQ) or
373 * invalid (EINVAL) value.
375 for (; u16l
< *utf16len
; u16l
++) {
376 if (u16s
[u16l
] == 0 && do_not_ignore_null
)
/* Fetch the unit in native order; nonzero inendian means no swap is needed. */
379 lo
= (uint32_t)((inendian
) ? u16s
[u16l
] : BSWAP_16(u16s
[u16l
]));
381 if (lo
>= UCONV_U16_HI_MIN
&& lo
<= UCONV_U16_HI_MAX
) {
386 } else if (lo
>= UCONV_U16_LO_MIN
&& lo
<= UCONV_U16_LO_MAX
) {
/*
 * Combine the saved high half (hi) with this low half.
 * NOTE(review): the end of this expression is missing from this extract.
 */
389 lo
= (((hi
- UCONV_U16_HI_MIN
) * UCONV_U16_BIT_SHIFT
+
390 lo
- UCONV_U16_LO_MIN
) & UCONV_U16_BIT_MASK
)
397 if (u32l
>= *utf32len
)
400 u32s
[u32l
++] = (outendian
) ? lo
: BSWAP_32(lo
);
404 * If high half didn't see low half, then, it's most likely the input
405 * parameter is incomplete.
411 * Save the number of consumed and saved characters. They do not
412 * include terminating NULL character (U+0000) at the end of
413 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
414 * the input buffer length is big enough to include the terminating
/*
 * uconv_u16tou8: convert UTF-16 units to UTF-8 bytes.
 *
 *   u16s     - UTF-16 input buffer.
 *   utf16len - number of input units available.
 *   u8s      - UTF-8 output buffer.
 *   utf8len  - capacity of u8s in bytes.
 *   flag     - conversion flags. Those read here are the endian bits (via
 *              check_endian), UCONV_IGNORE_NULL and UCONV_IN_ACCEPT_BOM.
 *
 * Visible behaviour: each input unit is brought to native order and
 * surrogate halves are combined into one value. The value is then written
 * as one byte (up to UCONV_U8_ONE_BYTE), two bytes (up to
 * UCONV_U8_TWO_BYTES), three bytes (up to UCONV_U8_THREE_BYTES) or four
 * bytes (up to UCONV_U8_FOUR_BYTES). Before each multibyte write the output
 * index plus the extra byte count is compared with *utf8len.
 * No handling of UCONV_OUT_EMIT_BOM is visible in this function.
 *
 * NOTE(review): this extract is missing lines, including the return type,
 * most local declarations, every return statement, several branch bodies
 * and any write-back of the lengths. Error codes are therefore not
 * documented here.
 */
424 uconv_u16tou8(const uint16_t *u16s
, size_t *utf16len
,
425 uchar_t
*u8s
, size_t *utf8len
, int flag
)
433 boolean_t do_not_ignore_null
;
435 if (u16s
== NULL
|| utf16len
== NULL
)
438 if (u8s
== NULL
|| utf8len
== NULL
)
441 if (check_endian(flag
, &inendian
, &outendian
) != 0)
446 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
448 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
449 check_bom16(u16s
, *utf16len
, &inendian
))
/* After this mask, nonzero inendian means the input is in native order. */
452 inendian
&= UCONV_IN_NAT_ENDIAN
;
454 for (; u16l
< *utf16len
; u16l
++) {
455 if (u16s
[u16l
] == 0 && do_not_ignore_null
)
458 lo
= (uint32_t)((inendian
) ? u16s
[u16l
] : BSWAP_16(u16s
[u16l
]));
460 if (lo
>= UCONV_U16_HI_MIN
&& lo
<= UCONV_U16_HI_MAX
) {
465 } else if (lo
>= UCONV_U16_LO_MIN
&& lo
<= UCONV_U16_LO_MAX
) {
/*
 * Combine the saved high half (hi) with this low half.
 * NOTE(review): the end of this expression is missing from this extract.
 */
468 lo
= (((hi
- UCONV_U16_HI_MIN
) * UCONV_U16_BIT_SHIFT
+
469 lo
- UCONV_U16_LO_MIN
) & UCONV_U16_BIT_MASK
)
477 * Now we convert a UTF-32 character into a UTF-8 character.
478 * Unicode coding space is between U+0000 and U+10FFFF;
479 * anything bigger is an illegal character.
481 if (lo
<= UCONV_U8_ONE_BYTE
) {
484 u8s
[u8l
++] = (uchar_t
)lo
;
485 } else if (lo
<= UCONV_U8_TWO_BYTES
) {
486 if ((u8l
+ 1) >= *utf8len
)
/* Two-byte form: 110xxxxx 10xxxxxx. */
488 u8s
[u8l
++] = (uchar_t
)(0xc0 | ((lo
& 0x07c0) >> 6));
489 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x003f));
490 } else if (lo
<= UCONV_U8_THREE_BYTES
) {
491 if ((u8l
+ 2) >= *utf8len
)
/* Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. */
493 u8s
[u8l
++] = (uchar_t
)(0xe0 | ((lo
& 0x0f000) >> 12));
494 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x00fc0) >> 6));
495 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x0003f));
496 } else if (lo
<= UCONV_U8_FOUR_BYTES
) {
497 if ((u8l
+ 3) >= *utf8len
)
/* Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
499 u8s
[u8l
++] = (uchar_t
)(0xf0 | ((lo
& 0x01c0000) >> 18));
500 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x003f000) >> 12));
501 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x0000fc0) >> 6));
502 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x000003f));
/*
 * uconv_u32tou16: convert UTF-32 units to UTF-16 units.
 *
 *   u32s     - UTF-32 input buffer.
 *   utf32len - number of input units available.
 *   u16s     - UTF-16 output buffer.
 *   utf16len - capacity of u16s in units.
 *   flag     - conversion flags. Those read here are the endian bits (via
 *              check_endian), UCONV_IGNORE_NULL, UCONV_IN_ACCEPT_BOM and
 *              UCONV_OUT_EMIT_BOM.
 *
 * Visible behaviour: each input unit is brought to native order and
 * compared with UCONV_UNICODE_MAX. Values at or above UCONV_U16_START are
 * split into two units (hi and lo) and both are stored; smaller values are
 * stored as one unit. Stored units are byte-swapped when outendian is zero.
 * The output index is checked against *utf16len before the stores.
 *
 * NOTE(review): this extract is missing lines, including the return type,
 * most local declarations, every return statement, the second operand of
 * the BOM selection, the tails of the surrogate expressions and any
 * write-back of the lengths. Error codes are therefore not documented here.
 */
518 uconv_u32tou16(const uint32_t *u32s
, size_t *utf32len
,
519 uint16_t *u16s
, size_t *utf16len
, int flag
)
527 boolean_t do_not_ignore_null
;
529 if (u32s
== NULL
|| utf32len
== NULL
)
532 if (u16s
== NULL
|| utf16len
== NULL
)
535 if (check_endian(flag
, &inendian
, &outendian
) != 0)
539 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
541 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
542 check_bom32(u32s
, *utf32len
, &inendian
))
/* After these masks, nonzero means native order for that direction. */
545 inendian
&= UCONV_IN_NAT_ENDIAN
;
546 outendian
&= UCONV_OUT_NAT_ENDIAN
;
548 if (*utf32len
> 0 && *utf16len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
549 u16s
[u16l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
552 for (; u32l
< *utf32len
; u32l
++) {
553 if (u32s
[u32l
] == 0 && do_not_ignore_null
)
556 hi
= (inendian
) ? u32s
[u32l
] : BSWAP_32(u32s
[u32l
]);
559 * Anything bigger than the Unicode coding space, i.e.,
560 * Unicode scalar value bigger than U+10FFFF, is an illegal
563 if (hi
> UCONV_UNICODE_MAX
)
567 * Anything bigger than U+FFFF must be converted into
568 * a surrogate pair in UTF-16.
570 if (hi
>= UCONV_U16_START
) {
/*
 * lo takes the remainder and hi the quotient of the offset above
 * UCONV_U16_START divided by UCONV_U16_BIT_SHIFT.
 * NOTE(review): the added base values are missing from this extract.
 */
571 lo
= ((hi
- UCONV_U16_START
) % UCONV_U16_BIT_SHIFT
) +
573 hi
= ((hi
- UCONV_U16_START
) / UCONV_U16_BIT_SHIFT
) +
576 if ((u16l
+ 1) >= *utf16len
)
580 u16s
[u16l
++] = (uint16_t)hi
;
581 u16s
[u16l
++] = (uint16_t)lo
;
583 u16s
[u16l
++] = BSWAP_16(((uint16_t)hi
));
584 u16s
[u16l
++] = BSWAP_16(((uint16_t)lo
));
587 if (u16l
>= *utf16len
)
589 u16s
[u16l
++] = (outendian
) ? (uint16_t)hi
:
590 BSWAP_16(((uint16_t)hi
));
/*
 * uconv_u32tou8: convert UTF-32 units to UTF-8 bytes.
 *
 *   u32s     - UTF-32 input buffer.
 *   utf32len - number of input units available.
 *   u8s      - UTF-8 output buffer.
 *   utf8len  - capacity of u8s in bytes.
 *   flag     - conversion flags. Those read here are the endian bits (via
 *              check_endian), UCONV_IGNORE_NULL and UCONV_IN_ACCEPT_BOM.
 *
 * Visible behaviour: each input unit is brought to native order and then
 * written as one byte (up to UCONV_U8_ONE_BYTE), two bytes (up to
 * UCONV_U8_TWO_BYTES), three bytes (up to UCONV_U8_THREE_BYTES) or four
 * bytes (up to UCONV_U8_FOUR_BYTES). Before each multibyte write the output
 * index plus the extra byte count is compared with *utf8len.
 * No handling of UCONV_OUT_EMIT_BOM is visible in this function.
 *
 * NOTE(review): this extract is missing lines, including the return type,
 * most local declarations, every return statement, several branch bodies
 * and any write-back of the lengths. Error codes are therefore not
 * documented here.
 */
601 uconv_u32tou8(const uint32_t *u32s
, size_t *utf32len
,
602 uchar_t
*u8s
, size_t *utf8len
, int flag
)
609 boolean_t do_not_ignore_null
;
611 if (u32s
== NULL
|| utf32len
== NULL
)
614 if (u8s
== NULL
|| utf8len
== NULL
)
617 if (check_endian(flag
, &inendian
, &outendian
) != 0)
621 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
623 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
624 check_bom32(u32s
, *utf32len
, &inendian
))
/* After this mask, nonzero inendian means the input is in native order. */
627 inendian
&= UCONV_IN_NAT_ENDIAN
;
629 for (; u32l
< *utf32len
; u32l
++) {
630 if (u32s
[u32l
] == 0 && do_not_ignore_null
)
633 lo
= (inendian
) ? u32s
[u32l
] : BSWAP_32(u32s
[u32l
]);
635 if (lo
<= UCONV_U8_ONE_BYTE
) {
638 u8s
[u8l
++] = (uchar_t
)lo
;
639 } else if (lo
<= UCONV_U8_TWO_BYTES
) {
640 if ((u8l
+ 1) >= *utf8len
)
/* Two-byte form: 110xxxxx 10xxxxxx. */
642 u8s
[u8l
++] = (uchar_t
)(0xc0 | ((lo
& 0x07c0) >> 6));
643 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x003f));
644 } else if (lo
<= UCONV_U8_THREE_BYTES
) {
645 if ((u8l
+ 2) >= *utf8len
)
/* Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. */
647 u8s
[u8l
++] = (uchar_t
)(0xe0 | ((lo
& 0x0f000) >> 12));
648 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x00fc0) >> 6));
649 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x0003f));
650 } else if (lo
<= UCONV_U8_FOUR_BYTES
) {
651 if ((u8l
+ 3) >= *utf8len
)
/* Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
653 u8s
[u8l
++] = (uchar_t
)(0xf0 | ((lo
& 0x01c0000) >> 18));
654 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x003f000) >> 12));
655 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x0000fc0) >> 6));
656 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x000003f));
/*
 * uconv_u8tou16: convert UTF-8 bytes to UTF-16 units.
 *
 *   u8s      - UTF-8 input buffer.
 *   utf8len  - number of input bytes available.
 *   u16s     - UTF-16 output buffer.
 *   utf16len - capacity of u16s in units.
 *   flag     - conversion flags. Those read here are the endian bits (via
 *              check_endian), UCONV_IGNORE_NULL and UCONV_OUT_EMIT_BOM.
 *
 * Visible behaviour: a byte above UCONV_ASCII_MAX is looked up in
 * remaining_bytes_tbl and masked with u8_masks_tbl. Each following byte is
 * range-checked, either against valid_min_2nd_byte/valid_max_2nd_byte
 * indexed by first_b or against UCONV_U8_BYTE_MIN/UCONV_U8_BYTE_MAX, and
 * its low six bits are shifted into the accumulated value. Values at or
 * above UCONV_U16_START are split into two units; smaller values are stored
 * as one unit. Stored units are byte-swapped when outendian is zero.
 * No handling of UCONV_IN_ACCEPT_BOM is visible in this function.
 *
 * NOTE(review): this extract is missing lines, including the return type,
 * most local declarations, every return statement, the assignment of
 * first_b, the condition that selects between the two range checks
 * (presumably "first following byte"; TODO confirm), the end-of-input check
 * and any write-back of the lengths. Error codes are therefore not
 * documented here.
 */
669 uconv_u8tou16(const uchar_t
*u8s
, size_t *utf8len
,
670 uint16_t *u16s
, size_t *utf16len
, int flag
)
680 boolean_t do_not_ignore_null
;
682 if (u8s
== NULL
|| utf8len
== NULL
)
685 if (u16s
== NULL
|| utf16len
== NULL
)
688 if (check_endian(flag
, &inendian
, &outendian
) != 0)
692 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
/* After this mask, nonzero outendian means output in native order. */
694 outendian
&= UCONV_OUT_NAT_ENDIAN
;
696 if (*utf8len
> 0 && *utf16len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
697 u16s
[u16l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
/* The input index is advanced inside the loop body, not in the for header. */
700 for (; u8l
< *utf8len
; ) {
701 if (u8s
[u8l
] == 0 && do_not_ignore_null
)
705 * Collect a UTF-8 character and convert it to a UTF-32
706 * character. In doing so, we screen out illegally formed
707 * UTF-8 characters and treat such as illegal characters.
708 * The algorithm at below also screens out anything bigger
711 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
712 * more details on the illegal values of UTF-8 character
715 hi
= (uint32_t)u8s
[u8l
++];
717 if (hi
> UCONV_ASCII_MAX
) {
718 if ((remaining_bytes
= remaining_bytes_tbl
[hi
]) == 0)
/* Keep only the payload bits of the first byte. */
722 hi
= hi
& u8_masks_tbl
[remaining_bytes
];
724 for (; remaining_bytes
> 0; remaining_bytes
--) {
726 * If we have no more bytes, the current
727 * UTF-8 character is incomplete.
732 lo
= (uint32_t)u8s
[u8l
++];
735 if (lo
< valid_min_2nd_byte
[first_b
] ||
736 lo
> valid_max_2nd_byte
[first_b
])
739 } else if (lo
< UCONV_U8_BYTE_MIN
||
740 lo
> UCONV_U8_BYTE_MAX
) {
/* Append the six payload bits of this byte to the accumulated value. */
743 hi
= (hi
<< UCONV_U8_BIT_SHIFT
) |
744 (lo
& UCONV_U8_BIT_MASK
);
748 if (hi
>= UCONV_U16_START
) {
/*
 * lo takes the remainder and hi the quotient of the offset above
 * UCONV_U16_START divided by UCONV_U16_BIT_SHIFT.
 * NOTE(review): the added base values are missing from this extract.
 */
749 lo
= ((hi
- UCONV_U16_START
) % UCONV_U16_BIT_SHIFT
) +
751 hi
= ((hi
- UCONV_U16_START
) / UCONV_U16_BIT_SHIFT
) +
754 if ((u16l
+ 1) >= *utf16len
)
758 u16s
[u16l
++] = (uint16_t)hi
;
759 u16s
[u16l
++] = (uint16_t)lo
;
761 u16s
[u16l
++] = BSWAP_16(((uint16_t)hi
));
762 u16s
[u16l
++] = BSWAP_16(((uint16_t)lo
));
765 if (u16l
>= *utf16len
)
768 u16s
[u16l
++] = (outendian
) ? (uint16_t)hi
:
769 BSWAP_16(((uint16_t)hi
));
/*
 * uconv_u8tou32: convert UTF-8 bytes to UTF-32 units.
 *
 *   u8s      - UTF-8 input buffer.
 *   utf8len  - number of input bytes available.
 *   u32s     - UTF-32 output buffer.
 *   utf32len - capacity of u32s in units.
 *   flag     - conversion flags. Those read here are the endian bits (via
 *              check_endian), UCONV_IGNORE_NULL and UCONV_OUT_EMIT_BOM.
 *
 * Visible behaviour: a byte above UCONV_ASCII_MAX is looked up in
 * remaining_bytes_tbl and masked with u8_masks_tbl. Each following byte is
 * range-checked, either against valid_min_2nd_byte/valid_max_2nd_byte
 * indexed by first_b or against UCONV_U8_BYTE_MIN/UCONV_U8_BYTE_MAX, and
 * its low six bits are shifted into the accumulated value. The output index
 * is checked against *utf32len, then the value is stored in native order or
 * byte-swapped according to outendian.
 * No handling of UCONV_IN_ACCEPT_BOM is visible in this function.
 *
 * NOTE(review): this extract is missing lines, including the return type,
 * most local declarations, every return statement, the assignment of
 * first_b, the condition that selects between the two range checks
 * (presumably "first following byte"; TODO confirm), the end-of-input check
 * and any write-back of the lengths. Error codes are therefore not
 * documented here.
 */
780 uconv_u8tou32(const uchar_t
*u8s
, size_t *utf8len
,
781 uint32_t *u32s
, size_t *utf32len
, int flag
)
791 boolean_t do_not_ignore_null
;
793 if (u8s
== NULL
|| utf8len
== NULL
)
796 if (u32s
== NULL
|| utf32len
== NULL
)
799 if (check_endian(flag
, &inendian
, &outendian
) != 0)
803 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
/* After this mask, nonzero outendian means output in native order. */
805 outendian
&= UCONV_OUT_NAT_ENDIAN
;
807 if (*utf8len
> 0 && *utf32len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
808 u32s
[u32l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
809 UCONV_BOM_SWAPPED_32
;
/* The input index is advanced inside the loop body, not in the for header. */
811 for (; u8l
< *utf8len
; ) {
812 if (u8s
[u8l
] == 0 && do_not_ignore_null
)
815 hi
= (uint32_t)u8s
[u8l
++];
817 if (hi
> UCONV_ASCII_MAX
) {
818 if ((remaining_bytes
= remaining_bytes_tbl
[hi
]) == 0)
/* Keep only the payload bits of the first byte. */
822 hi
= hi
& u8_masks_tbl
[remaining_bytes
];
824 for (; remaining_bytes
> 0; remaining_bytes
--) {
828 c
= (uint32_t)u8s
[u8l
++];
831 if (c
< valid_min_2nd_byte
[first_b
] ||
832 c
> valid_max_2nd_byte
[first_b
])
835 } else if (c
< UCONV_U8_BYTE_MIN
||
836 c
> UCONV_U8_BYTE_MAX
) {
/* Append the six payload bits of this byte to the accumulated value. */
839 hi
= (hi
<< UCONV_U8_BIT_SHIFT
) |
840 (c
& UCONV_U8_BIT_MASK
);
844 if (u32l
>= *utf32len
)
847 u32s
[u32l
++] = (outendian
) ? hi
: BSWAP_32(hi
);
/*
 * Export the six public conversion entry points defined in this file.
 * NOTE(review): EXPORT_SYMBOL is not defined in the visible part of this
 * file; presumably it is the kernel module symbol-export macro and these
 * lines sit under a kernel-only conditional (TODO confirm).
 */
857 EXPORT_SYMBOL(uconv_u16tou32
);
858 EXPORT_SYMBOL(uconv_u16tou8
);
859 EXPORT_SYMBOL(uconv_u32tou16
);
860 EXPORT_SYMBOL(uconv_u32tou8
);
861 EXPORT_SYMBOL(uconv_u8tou16
);
862 EXPORT_SYMBOL(uconv_u8tou32
);