]> git.proxmox.com Git - mirror_zfs.git/blob - module/unicode/uconv.c
Update build system and packaging
[mirror_zfs.git] / module / unicode / uconv.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26
27
28 /*
29 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33 * the section 3C man pages.
34 * Interface stability: Committed
35 */
36
37 #include <sys/types.h>
38 #ifdef _KERNEL
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/debug.h>
42 #include <sys/kmem.h>
43 #include <sys/sunddi.h>
44 #else
45 #include <sys/u8_textprep.h>
46 #endif /* _KERNEL */
47 #include <sys/byteorder.h>
48 #include <sys/errno.h>
49
50
/*
 * UTF-16 surrogate layout.
 *
 * High (leading) surrogates occupy U+D800..U+DBFF and low (trailing)
 * surrogates occupy U+DC00..U+DFFF. UCONV_U16_BIT_SHIFT is used as a
 * multiplier/divisor (0x400) to place the high half above the low half,
 * UCONV_U16_BIT_MASK keeps the combined 20-bit offset, and
 * UCONV_U16_START is the first code point beyond the BMP, to which that
 * offset is added.
 */
#define	UCONV_U16_HI_MIN		(0xd800U)
#define	UCONV_U16_HI_MAX		(0xdbffU)
#define	UCONV_U16_LO_MIN		(0xdc00U)
#define	UCONV_U16_LO_MAX		(0xdfffU)
#define	UCONV_U16_BIT_SHIFT		(0x0400U)
#define	UCONV_U16_BIT_MASK		(0x0fffffU)
#define	UCONV_U16_START			(0x010000U)

/* Upper bounds of the Unicode and the ASCII coding spaces. */
#define	UCONV_UNICODE_MAX		(0x10ffffU)
#define	UCONV_ASCII_MAX			(0x7fU)

/* Every endian-selection bit for the input side and for the output side. */
#define	UCONV_IN_ENDIAN_MASKS		\
	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS		\
	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Flag values that mean "same byte order as this machine" (NAT) and
 * "opposite byte order" (REV), chosen at compile time.
 */
#if defined(_BIG_ENDIAN)
#define	UCONV_IN_NAT_ENDIAN		UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN		UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN		UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN		UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN		UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN		UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN		UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN		UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (U+FEFF) as it appears when read in the matching
 * byte order, and when read byte-swapped as a 16-bit or a 32-bit unit.
 */
#define	UCONV_BOM_NORMAL		(0xfeffU)
#define	UCONV_BOM_SWAPPED		(0xfffeU)
#define	UCONV_BOM_SWAPPED_32		(0xfffe0000U)

/* Largest code point that fits in a UTF-8 character of each length. */
#define	UCONV_U8_ONE_BYTE		(0x7fU)
#define	UCONV_U8_TWO_BYTES		(0x7ffU)
#define	UCONV_U8_THREE_BYTES		(0xffffU)
#define	UCONV_U8_FOUR_BYTES		(0x10ffffU)

/* Generic value range of a UTF-8 trailing ("10xx xxxx") byte. */
#define	UCONV_U8_BYTE_MIN		(0x80U)
#define	UCONV_U8_BYTE_MAX		(0xbfU)

/*
 * A trailing byte of the form "10xx xxxx" carries six payload bits:
 * shift the accumulator by 6 and keep the low 0x3f bits of the byte.
 */
#define	UCONV_U8_BIT_SHIFT		6
#define	UCONV_U8_BIT_MASK		0x3f
105
106 /*
107 * The following vector shows remaining bytes in a UTF-8 character.
108 * Index will be the first byte of the character.
109 */
110 static const uchar_t remaining_bytes_tbl[0x100] = {
111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123
124 /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
125 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126
127 /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
128 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129
130 /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
131 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
132
133 /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
134 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
135 };
136
137 /*
138 * The following is a vector of bit-masks to get used bits in
139 * the first byte of a UTF-8 character. Index is remaining bytes at above of
140 * the character.
141 */
142 #ifdef _KERNEL
143 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
144 #else
145 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
146 #endif /* _KERNEL */
147
148 /*
149 * The following two vectors are to provide valid minimum and
150 * maximum values for the 2'nd byte of a multibyte UTF-8 character for
151 * better illegal sequence checking. The index value must be the value of
152 * the first byte of the UTF-8 character.
153 */
154 static const uchar_t valid_min_2nd_byte[0x100] = {
155 0, 0, 0, 0, 0, 0, 0, 0,
156 0, 0, 0, 0, 0, 0, 0, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, 0, 0, 0, 0,
178 0, 0, 0, 0, 0, 0, 0, 0,
179
180 /* C0 C1 C2 C3 C4 C5 C6 C7 */
181 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
182
183 /* C8 C9 CA CB CC CD CE CF */
184 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
185
186 /* D0 D1 D2 D3 D4 D5 D6 D7 */
187 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
188
189 /* D8 D9 DA DB DC DD DE DF */
190 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
191
192 /* E0 E1 E2 E3 E4 E5 E6 E7 */
193 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
194
195 /* E8 E9 EA EB EC ED EE EF */
196 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
197
198 /* F0 F1 F2 F3 F4 F5 F6 F7 */
199 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
200
201 0, 0, 0, 0, 0, 0, 0, 0
202 };
203
204 static const uchar_t valid_max_2nd_byte[0x100] = {
205 0, 0, 0, 0, 0, 0, 0, 0,
206 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0,
208 0, 0, 0, 0, 0, 0, 0, 0,
209 0, 0, 0, 0, 0, 0, 0, 0,
210 0, 0, 0, 0, 0, 0, 0, 0,
211 0, 0, 0, 0, 0, 0, 0, 0,
212 0, 0, 0, 0, 0, 0, 0, 0,
213 0, 0, 0, 0, 0, 0, 0, 0,
214 0, 0, 0, 0, 0, 0, 0, 0,
215 0, 0, 0, 0, 0, 0, 0, 0,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0,
218 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0,
228 0, 0, 0, 0, 0, 0, 0, 0,
229
230 /* C0 C1 C2 C3 C4 C5 C6 C7 */
231 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
232
233 /* C8 C9 CA CB CC CD CE CF */
234 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
235
236 /* D0 D1 D2 D3 D4 D5 D6 D7 */
237 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
238
239 /* D8 D9 DA DB DC DD DE DF */
240 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
241
242 /* E0 E1 E2 E3 E4 E5 E6 E7 */
243 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
244
245 /* E8 E9 EA EB EC ED EE EF */
246 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
247
248 /* F0 F1 F2 F3 F4 F5 F6 F7 */
249 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
250
251 0, 0, 0, 0, 0, 0, 0, 0
252 };
253
254
255 static int
256 check_endian(int flag, int *in, int *out)
257 {
258 *in = flag & UCONV_IN_ENDIAN_MASKS;
259
260 /* You cannot have both. */
261 if (*in == UCONV_IN_ENDIAN_MASKS)
262 return (EBADF);
263
264 if (*in == 0)
265 *in = UCONV_IN_NAT_ENDIAN;
266
267 *out = flag & UCONV_OUT_ENDIAN_MASKS;
268
269 /* You cannot have both. */
270 if (*out == UCONV_OUT_ENDIAN_MASKS)
271 return (EBADF);
272
273 if (*out == 0)
274 *out = UCONV_OUT_NAT_ENDIAN;
275
276 return (0);
277 }
278
279 static boolean_t
280 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
281 {
282 if (u16l > 0) {
283 if (*u16s == UCONV_BOM_NORMAL) {
284 *in = UCONV_IN_NAT_ENDIAN;
285 return (B_TRUE);
286 }
287 if (*u16s == UCONV_BOM_SWAPPED) {
288 *in = UCONV_IN_REV_ENDIAN;
289 return (B_TRUE);
290 }
291 }
292
293 return (B_FALSE);
294 }
295
296 static boolean_t
297 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
298 {
299 if (u32l > 0) {
300 if (*u32s == UCONV_BOM_NORMAL) {
301 *in = UCONV_IN_NAT_ENDIAN;
302 return (B_TRUE);
303 }
304 if (*u32s == UCONV_BOM_SWAPPED_32) {
305 *in = UCONV_IN_REV_ENDIAN;
306 return (B_TRUE);
307 }
308 }
309
310 return (B_FALSE);
311 }
312
313 int
314 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
315 uint32_t *u32s, size_t *utf32len, int flag)
316 {
317 int inendian;
318 int outendian;
319 size_t u16l;
320 size_t u32l;
321 uint32_t hi;
322 uint32_t lo;
323 boolean_t do_not_ignore_null;
324
325 /*
326 * Do preliminary validity checks on parameters and collect info on
327 * endians.
328 */
329 if (u16s == NULL || utf16len == NULL)
330 return (EILSEQ);
331
332 if (u32s == NULL || utf32len == NULL)
333 return (E2BIG);
334
335 if (check_endian(flag, &inendian, &outendian) != 0)
336 return (EBADF);
337
338 /*
339 * Initialize input and output parameter buffer indices and
340 * temporary variables.
341 */
342 u16l = u32l = 0;
343 hi = 0;
344 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
345
346 /*
347 * Check on the BOM at the beginning of the input buffer if required
348 * and if there is indeed one, process it.
349 */
350 if ((flag & UCONV_IN_ACCEPT_BOM) &&
351 check_bom16(u16s, *utf16len, &inendian))
352 u16l++;
353
354 /*
355 * Reset inendian and outendian so that after this point, those can be
356 * used as condition values.
357 */
358 inendian &= UCONV_IN_NAT_ENDIAN;
359 outendian &= UCONV_OUT_NAT_ENDIAN;
360
361 /*
362 * If there is something in the input buffer and if necessary and
363 * requested, save the BOM at the output buffer.
364 */
365 if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
366 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
367 UCONV_BOM_SWAPPED_32;
368
369 /*
370 * Do conversion; if encounter a surrogate pair, assemble high and
371 * low pair values to form a UTF-32 character. If a half of a pair
372 * exists alone, then, either it is an illegal (EILSEQ) or
373 * invalid (EINVAL) value.
374 */
375 for (; u16l < *utf16len; u16l++) {
376 if (u16s[u16l] == 0 && do_not_ignore_null)
377 break;
378
379 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
380
381 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
382 if (hi)
383 return (EILSEQ);
384 hi = lo;
385 continue;
386 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
387 if (! hi)
388 return (EILSEQ);
389 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
390 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
391 + UCONV_U16_START;
392 hi = 0;
393 } else if (hi) {
394 return (EILSEQ);
395 }
396
397 if (u32l >= *utf32len)
398 return (E2BIG);
399
400 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
401 }
402
403 /*
404 * If high half didn't see low half, then, it's most likely the input
405 * parameter is incomplete.
406 */
407 if (hi)
408 return (EINVAL);
409
410 /*
411 * Save the number of consumed and saved characters. They do not
412 * include terminating NULL character (U+0000) at the end of
413 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
414 * the input buffer length is big enough to include the terminating
415 * NULL character).
416 */
417 *utf16len = u16l;
418 *utf32len = u32l;
419
420 return (0);
421 }
422
423 int
424 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
425 uchar_t *u8s, size_t *utf8len, int flag)
426 {
427 int inendian;
428 int outendian;
429 size_t u16l;
430 size_t u8l;
431 uint32_t hi;
432 uint32_t lo;
433 boolean_t do_not_ignore_null;
434
435 if (u16s == NULL || utf16len == NULL)
436 return (EILSEQ);
437
438 if (u8s == NULL || utf8len == NULL)
439 return (E2BIG);
440
441 if (check_endian(flag, &inendian, &outendian) != 0)
442 return (EBADF);
443
444 u16l = u8l = 0;
445 hi = 0;
446 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
447
448 if ((flag & UCONV_IN_ACCEPT_BOM) &&
449 check_bom16(u16s, *utf16len, &inendian))
450 u16l++;
451
452 inendian &= UCONV_IN_NAT_ENDIAN;
453
454 for (; u16l < *utf16len; u16l++) {
455 if (u16s[u16l] == 0 && do_not_ignore_null)
456 break;
457
458 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
459
460 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
461 if (hi)
462 return (EILSEQ);
463 hi = lo;
464 continue;
465 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
466 if (! hi)
467 return (EILSEQ);
468 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
469 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
470 + UCONV_U16_START;
471 hi = 0;
472 } else if (hi) {
473 return (EILSEQ);
474 }
475
476 /*
477 * Now we convert a UTF-32 character into a UTF-8 character.
478 * Unicode coding space is between U+0000 and U+10FFFF;
479 * anything bigger is an illegal character.
480 */
481 if (lo <= UCONV_U8_ONE_BYTE) {
482 if (u8l >= *utf8len)
483 return (E2BIG);
484 u8s[u8l++] = (uchar_t)lo;
485 } else if (lo <= UCONV_U8_TWO_BYTES) {
486 if ((u8l + 1) >= *utf8len)
487 return (E2BIG);
488 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
489 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
490 } else if (lo <= UCONV_U8_THREE_BYTES) {
491 if ((u8l + 2) >= *utf8len)
492 return (E2BIG);
493 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
494 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
495 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
496 } else if (lo <= UCONV_U8_FOUR_BYTES) {
497 if ((u8l + 3) >= *utf8len)
498 return (E2BIG);
499 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
500 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
501 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
502 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
503 } else {
504 return (EILSEQ);
505 }
506 }
507
508 if (hi)
509 return (EINVAL);
510
511 *utf16len = u16l;
512 *utf8len = u8l;
513
514 return (0);
515 }
516
517 int
518 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
519 uint16_t *u16s, size_t *utf16len, int flag)
520 {
521 int inendian;
522 int outendian;
523 size_t u16l;
524 size_t u32l;
525 uint32_t hi;
526 uint32_t lo;
527 boolean_t do_not_ignore_null;
528
529 if (u32s == NULL || utf32len == NULL)
530 return (EILSEQ);
531
532 if (u16s == NULL || utf16len == NULL)
533 return (E2BIG);
534
535 if (check_endian(flag, &inendian, &outendian) != 0)
536 return (EBADF);
537
538 u16l = u32l = 0;
539 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
540
541 if ((flag & UCONV_IN_ACCEPT_BOM) &&
542 check_bom32(u32s, *utf32len, &inendian))
543 u32l++;
544
545 inendian &= UCONV_IN_NAT_ENDIAN;
546 outendian &= UCONV_OUT_NAT_ENDIAN;
547
548 if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
549 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
550 UCONV_BOM_SWAPPED;
551
552 for (; u32l < *utf32len; u32l++) {
553 if (u32s[u32l] == 0 && do_not_ignore_null)
554 break;
555
556 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
557
558 /*
559 * Anything bigger than the Unicode coding space, i.e.,
560 * Unicode scalar value bigger than U+10FFFF, is an illegal
561 * character.
562 */
563 if (hi > UCONV_UNICODE_MAX)
564 return (EILSEQ);
565
566 /*
567 * Anything bigger than U+FFFF must be converted into
568 * a surrogate pair in UTF-16.
569 */
570 if (hi >= UCONV_U16_START) {
571 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
572 UCONV_U16_LO_MIN;
573 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
574 UCONV_U16_HI_MIN;
575
576 if ((u16l + 1) >= *utf16len)
577 return (E2BIG);
578
579 if (outendian) {
580 u16s[u16l++] = (uint16_t)hi;
581 u16s[u16l++] = (uint16_t)lo;
582 } else {
583 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
584 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
585 }
586 } else {
587 if (u16l >= *utf16len)
588 return (E2BIG);
589 u16s[u16l++] = (outendian) ? (uint16_t)hi :
590 BSWAP_16(((uint16_t)hi));
591 }
592 }
593
594 *utf16len = u16l;
595 *utf32len = u32l;
596
597 return (0);
598 }
599
600 int
601 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
602 uchar_t *u8s, size_t *utf8len, int flag)
603 {
604 int inendian;
605 int outendian;
606 size_t u32l;
607 size_t u8l;
608 uint32_t lo;
609 boolean_t do_not_ignore_null;
610
611 if (u32s == NULL || utf32len == NULL)
612 return (EILSEQ);
613
614 if (u8s == NULL || utf8len == NULL)
615 return (E2BIG);
616
617 if (check_endian(flag, &inendian, &outendian) != 0)
618 return (EBADF);
619
620 u32l = u8l = 0;
621 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
622
623 if ((flag & UCONV_IN_ACCEPT_BOM) &&
624 check_bom32(u32s, *utf32len, &inendian))
625 u32l++;
626
627 inendian &= UCONV_IN_NAT_ENDIAN;
628
629 for (; u32l < *utf32len; u32l++) {
630 if (u32s[u32l] == 0 && do_not_ignore_null)
631 break;
632
633 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
634
635 if (lo <= UCONV_U8_ONE_BYTE) {
636 if (u8l >= *utf8len)
637 return (E2BIG);
638 u8s[u8l++] = (uchar_t)lo;
639 } else if (lo <= UCONV_U8_TWO_BYTES) {
640 if ((u8l + 1) >= *utf8len)
641 return (E2BIG);
642 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
643 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
644 } else if (lo <= UCONV_U8_THREE_BYTES) {
645 if ((u8l + 2) >= *utf8len)
646 return (E2BIG);
647 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
648 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
649 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
650 } else if (lo <= UCONV_U8_FOUR_BYTES) {
651 if ((u8l + 3) >= *utf8len)
652 return (E2BIG);
653 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
654 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
655 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
656 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
657 } else {
658 return (EILSEQ);
659 }
660 }
661
662 *utf32len = u32l;
663 *utf8len = u8l;
664
665 return (0);
666 }
667
668 int
669 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
670 uint16_t *u16s, size_t *utf16len, int flag)
671 {
672 int inendian;
673 int outendian;
674 size_t u16l;
675 size_t u8l;
676 uint32_t hi;
677 uint32_t lo;
678 int remaining_bytes;
679 int first_b;
680 boolean_t do_not_ignore_null;
681
682 if (u8s == NULL || utf8len == NULL)
683 return (EILSEQ);
684
685 if (u16s == NULL || utf16len == NULL)
686 return (E2BIG);
687
688 if (check_endian(flag, &inendian, &outendian) != 0)
689 return (EBADF);
690
691 u16l = u8l = 0;
692 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
693
694 outendian &= UCONV_OUT_NAT_ENDIAN;
695
696 if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
697 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
698 UCONV_BOM_SWAPPED;
699
700 for (; u8l < *utf8len; ) {
701 if (u8s[u8l] == 0 && do_not_ignore_null)
702 break;
703
704 /*
705 * Collect a UTF-8 character and convert it to a UTF-32
706 * character. In doing so, we screen out illegally formed
707 * UTF-8 characters and treat such as illegal characters.
708 * The algorithm at below also screens out anything bigger
709 * than the U+10FFFF.
710 *
711 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
712 * more details on the illegal values of UTF-8 character
713 * bytes.
714 */
715 hi = (uint32_t)u8s[u8l++];
716
717 if (hi > UCONV_ASCII_MAX) {
718 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
719 return (EILSEQ);
720
721 first_b = hi;
722 hi = hi & u8_masks_tbl[remaining_bytes];
723
724 for (; remaining_bytes > 0; remaining_bytes--) {
725 /*
726 * If we have no more bytes, the current
727 * UTF-8 character is incomplete.
728 */
729 if (u8l >= *utf8len)
730 return (EINVAL);
731
732 lo = (uint32_t)u8s[u8l++];
733
734 if (first_b) {
735 if (lo < valid_min_2nd_byte[first_b] ||
736 lo > valid_max_2nd_byte[first_b])
737 return (EILSEQ);
738 first_b = 0;
739 } else if (lo < UCONV_U8_BYTE_MIN ||
740 lo > UCONV_U8_BYTE_MAX) {
741 return (EILSEQ);
742 }
743 hi = (hi << UCONV_U8_BIT_SHIFT) |
744 (lo & UCONV_U8_BIT_MASK);
745 }
746 }
747
748 if (hi >= UCONV_U16_START) {
749 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
750 UCONV_U16_LO_MIN;
751 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
752 UCONV_U16_HI_MIN;
753
754 if ((u16l + 1) >= *utf16len)
755 return (E2BIG);
756
757 if (outendian) {
758 u16s[u16l++] = (uint16_t)hi;
759 u16s[u16l++] = (uint16_t)lo;
760 } else {
761 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
762 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
763 }
764 } else {
765 if (u16l >= *utf16len)
766 return (E2BIG);
767
768 u16s[u16l++] = (outendian) ? (uint16_t)hi :
769 BSWAP_16(((uint16_t)hi));
770 }
771 }
772
773 *utf16len = u16l;
774 *utf8len = u8l;
775
776 return (0);
777 }
778
779 int
780 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
781 uint32_t *u32s, size_t *utf32len, int flag)
782 {
783 int inendian;
784 int outendian;
785 size_t u32l;
786 size_t u8l;
787 uint32_t hi;
788 uint32_t c;
789 int remaining_bytes;
790 int first_b;
791 boolean_t do_not_ignore_null;
792
793 if (u8s == NULL || utf8len == NULL)
794 return (EILSEQ);
795
796 if (u32s == NULL || utf32len == NULL)
797 return (E2BIG);
798
799 if (check_endian(flag, &inendian, &outendian) != 0)
800 return (EBADF);
801
802 u32l = u8l = 0;
803 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
804
805 outendian &= UCONV_OUT_NAT_ENDIAN;
806
807 if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
808 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
809 UCONV_BOM_SWAPPED_32;
810
811 for (; u8l < *utf8len; ) {
812 if (u8s[u8l] == 0 && do_not_ignore_null)
813 break;
814
815 hi = (uint32_t)u8s[u8l++];
816
817 if (hi > UCONV_ASCII_MAX) {
818 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
819 return (EILSEQ);
820
821 first_b = hi;
822 hi = hi & u8_masks_tbl[remaining_bytes];
823
824 for (; remaining_bytes > 0; remaining_bytes--) {
825 if (u8l >= *utf8len)
826 return (EINVAL);
827
828 c = (uint32_t)u8s[u8l++];
829
830 if (first_b) {
831 if (c < valid_min_2nd_byte[first_b] ||
832 c > valid_max_2nd_byte[first_b])
833 return (EILSEQ);
834 first_b = 0;
835 } else if (c < UCONV_U8_BYTE_MIN ||
836 c > UCONV_U8_BYTE_MAX) {
837 return (EILSEQ);
838 }
839 hi = (hi << UCONV_U8_BIT_SHIFT) |
840 (c & UCONV_U8_BIT_MASK);
841 }
842 }
843
844 if (u32l >= *utf32len)
845 return (E2BIG);
846
847 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
848 }
849
850 *utf32len = u32l;
851 *utf8len = u8l;
852
853 return (0);
854 }
855
/* In kernel builds, export the six conversion entry points. */
#ifdef _KERNEL
EXPORT_SYMBOL(uconv_u16tou32);
EXPORT_SYMBOL(uconv_u16tou8);
EXPORT_SYMBOL(uconv_u32tou16);
EXPORT_SYMBOL(uconv_u32tou8);
EXPORT_SYMBOL(uconv_u8tou16);
EXPORT_SYMBOL(uconv_u8tou32);
#endif	/* _KERNEL */