]> git.proxmox.com Git - mirror_zfs.git/blame - module/unicode/uconv.c
BRT: Make BRT block sizes configurable
[mirror_zfs.git] / module / unicode / uconv.c
CommitLineData
42bcb36c
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
1d3ba0bf 9 * or https://opensource.org/licenses/CDDL-1.0.
42bcb36c
BB
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
e5dc681a 26
42bcb36c
BB
27
28/*
29 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33 * the section 3C man pages.
34 * Interface stability: Committed
35 */
36
37#include <sys/types.h>
38#ifdef _KERNEL
39#include <sys/param.h>
40#include <sys/sysmacros.h>
42bcb36c
BB
41#include <sys/debug.h>
42#include <sys/kmem.h>
43#include <sys/sunddi.h>
44#else
45#include <sys/u8_textprep.h>
46#endif /* _KERNEL */
47#include <sys/byteorder.h>
48#include <sys/errno.h>
49
50
/*
 * UTF-16 surrogate ranges (high half, then low half), the factor used to
 * combine or split the two halves of a pair, the mask applied to the
 * combined offset, and the first code point that lies outside of the BMP.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* Upper bounds of the Unicode coding space and of the ASCII coding space. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/* Masks that select the input and the output endian bits of a flag word. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/* Map "native" and "reversed" byte order onto the explicit endian flags. */
#ifdef _ZFS_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _ZFS_BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) as it reads in native byte order, and as it
 * reads when byte-swapped in a 16-bit and in a 32-bit unit.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/* Largest UTF-32 value that fits in one, two, three and four UTF-8 bytes. */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/* Generic lower and upper bound of a UTF-8 continuation byte. */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * A UTF-8 continuation byte has the bit layout "10xx xxxx": it carries
 * 6 payload bits, which are selected with the mask 0x3f.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
105
106/*
107 * The following vector shows remaining bytes in a UTF-8 character.
108 * Index will be the first byte of the character.
109 */
110static const uchar_t remaining_bytes_tbl[0x100] = {
111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123
124/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
125 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126
127/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
128 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129
130/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
131 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
132
133/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
134 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
135};
136
137/*
138 * The following is a vector of bit-masks to get used bits in
139 * the first byte of a UTF-8 character. Index is remaining bytes at above of
140 * the character.
141 */
42bcb36c 142static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
42bcb36c
BB
143
144/*
145 * The following two vectors are to provide valid minimum and
146 * maximum values for the 2'nd byte of a multibyte UTF-8 character for
147 * better illegal sequence checking. The index value must be the value of
148 * the first byte of the UTF-8 character.
149 */
150static const uchar_t valid_min_2nd_byte[0x100] = {
151 0, 0, 0, 0, 0, 0, 0, 0,
152 0, 0, 0, 0, 0, 0, 0, 0,
153 0, 0, 0, 0, 0, 0, 0, 0,
154 0, 0, 0, 0, 0, 0, 0, 0,
155 0, 0, 0, 0, 0, 0, 0, 0,
156 0, 0, 0, 0, 0, 0, 0, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0,
175
176/* C0 C1 C2 C3 C4 C5 C6 C7 */
177 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
178
179/* C8 C9 CA CB CC CD CE CF */
180 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
181
182/* D0 D1 D2 D3 D4 D5 D6 D7 */
183 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
184
185/* D8 D9 DA DB DC DD DE DF */
186 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
187
188/* E0 E1 E2 E3 E4 E5 E6 E7 */
189 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
190
191/* E8 E9 EA EB EC ED EE EF */
192 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
193
194/* F0 F1 F2 F3 F4 F5 F6 F7 */
195 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
196
197 0, 0, 0, 0, 0, 0, 0, 0
198};
199
200static const uchar_t valid_max_2nd_byte[0x100] = {
201 0, 0, 0, 0, 0, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0,
203 0, 0, 0, 0, 0, 0, 0, 0,
204 0, 0, 0, 0, 0, 0, 0, 0,
205 0, 0, 0, 0, 0, 0, 0, 0,
206 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0,
208 0, 0, 0, 0, 0, 0, 0, 0,
209 0, 0, 0, 0, 0, 0, 0, 0,
210 0, 0, 0, 0, 0, 0, 0, 0,
211 0, 0, 0, 0, 0, 0, 0, 0,
212 0, 0, 0, 0, 0, 0, 0, 0,
213 0, 0, 0, 0, 0, 0, 0, 0,
214 0, 0, 0, 0, 0, 0, 0, 0,
215 0, 0, 0, 0, 0, 0, 0, 0,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0,
218 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0,
225
226/* C0 C1 C2 C3 C4 C5 C6 C7 */
227 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
228
229/* C8 C9 CA CB CC CD CE CF */
230 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
231
232/* D0 D1 D2 D3 D4 D5 D6 D7 */
233 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
234
235/* D8 D9 DA DB DC DD DE DF */
236 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
237
238/* E0 E1 E2 E3 E4 E5 E6 E7 */
239 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
240
241/* E8 E9 EA EB EC ED EE EF */
242 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
243
244/* F0 F1 F2 F3 F4 F5 F6 F7 */
245 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
246
247 0, 0, 0, 0, 0, 0, 0, 0
248};
249
250
251static int
252check_endian(int flag, int *in, int *out)
253{
254 *in = flag & UCONV_IN_ENDIAN_MASKS;
255
256 /* You cannot have both. */
257 if (*in == UCONV_IN_ENDIAN_MASKS)
258 return (EBADF);
259
260 if (*in == 0)
261 *in = UCONV_IN_NAT_ENDIAN;
262
263 *out = flag & UCONV_OUT_ENDIAN_MASKS;
264
265 /* You cannot have both. */
266 if (*out == UCONV_OUT_ENDIAN_MASKS)
267 return (EBADF);
268
269 if (*out == 0)
270 *out = UCONV_OUT_NAT_ENDIAN;
271
272 return (0);
273}
274
275static boolean_t
276check_bom16(const uint16_t *u16s, size_t u16l, int *in)
277{
278 if (u16l > 0) {
279 if (*u16s == UCONV_BOM_NORMAL) {
280 *in = UCONV_IN_NAT_ENDIAN;
281 return (B_TRUE);
282 }
283 if (*u16s == UCONV_BOM_SWAPPED) {
284 *in = UCONV_IN_REV_ENDIAN;
285 return (B_TRUE);
286 }
287 }
288
289 return (B_FALSE);
290}
291
292static boolean_t
293check_bom32(const uint32_t *u32s, size_t u32l, int *in)
294{
295 if (u32l > 0) {
296 if (*u32s == UCONV_BOM_NORMAL) {
297 *in = UCONV_IN_NAT_ENDIAN;
298 return (B_TRUE);
299 }
300 if (*u32s == UCONV_BOM_SWAPPED_32) {
301 *in = UCONV_IN_REV_ENDIAN;
302 return (B_TRUE);
303 }
304 }
305
306 return (B_FALSE);
307}
308
309int
310uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
311 uint32_t *u32s, size_t *utf32len, int flag)
312{
313 int inendian;
314 int outendian;
315 size_t u16l;
316 size_t u32l;
317 uint32_t hi;
318 uint32_t lo;
319 boolean_t do_not_ignore_null;
320
321 /*
322 * Do preliminary validity checks on parameters and collect info on
323 * endians.
324 */
325 if (u16s == NULL || utf16len == NULL)
326 return (EILSEQ);
327
328 if (u32s == NULL || utf32len == NULL)
329 return (E2BIG);
330
331 if (check_endian(flag, &inendian, &outendian) != 0)
332 return (EBADF);
333
334 /*
335 * Initialize input and output parameter buffer indices and
336 * temporary variables.
337 */
338 u16l = u32l = 0;
339 hi = 0;
340 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
341
342 /*
343 * Check on the BOM at the beginning of the input buffer if required
344 * and if there is indeed one, process it.
345 */
346 if ((flag & UCONV_IN_ACCEPT_BOM) &&
347 check_bom16(u16s, *utf16len, &inendian))
348 u16l++;
349
350 /*
351 * Reset inendian and outendian so that after this point, those can be
352 * used as condition values.
353 */
354 inendian &= UCONV_IN_NAT_ENDIAN;
355 outendian &= UCONV_OUT_NAT_ENDIAN;
356
357 /*
358 * If there is something in the input buffer and if necessary and
359 * requested, save the BOM at the output buffer.
360 */
361 if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
362 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
363 UCONV_BOM_SWAPPED_32;
364
365 /*
366 * Do conversion; if encounter a surrogate pair, assemble high and
367 * low pair values to form a UTF-32 character. If a half of a pair
368 * exists alone, then, either it is an illegal (EILSEQ) or
369 * invalid (EINVAL) value.
370 */
371 for (; u16l < *utf16len; u16l++) {
372 if (u16s[u16l] == 0 && do_not_ignore_null)
373 break;
374
375 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
376
377 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
378 if (hi)
379 return (EILSEQ);
380 hi = lo;
381 continue;
382 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
383 if (! hi)
384 return (EILSEQ);
385 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
386 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
387 + UCONV_U16_START;
388 hi = 0;
389 } else if (hi) {
390 return (EILSEQ);
391 }
392
393 if (u32l >= *utf32len)
394 return (E2BIG);
395
396 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
397 }
398
399 /*
400 * If high half didn't see low half, then, it's most likely the input
401 * parameter is incomplete.
402 */
403 if (hi)
404 return (EINVAL);
405
406 /*
407 * Save the number of consumed and saved characters. They do not
408 * include terminating NULL character (U+0000) at the end of
409 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
410 * the input buffer length is big enough to include the terminating
411 * NULL character).
412 */
413 *utf16len = u16l;
414 *utf32len = u32l;
415
416 return (0);
417}
418
419int
420uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
421 uchar_t *u8s, size_t *utf8len, int flag)
422{
423 int inendian;
424 int outendian;
425 size_t u16l;
426 size_t u8l;
427 uint32_t hi;
428 uint32_t lo;
429 boolean_t do_not_ignore_null;
430
431 if (u16s == NULL || utf16len == NULL)
432 return (EILSEQ);
433
434 if (u8s == NULL || utf8len == NULL)
435 return (E2BIG);
436
437 if (check_endian(flag, &inendian, &outendian) != 0)
438 return (EBADF);
439
440 u16l = u8l = 0;
441 hi = 0;
442 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
443
444 if ((flag & UCONV_IN_ACCEPT_BOM) &&
445 check_bom16(u16s, *utf16len, &inendian))
446 u16l++;
447
448 inendian &= UCONV_IN_NAT_ENDIAN;
449
450 for (; u16l < *utf16len; u16l++) {
451 if (u16s[u16l] == 0 && do_not_ignore_null)
452 break;
453
454 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
455
456 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
457 if (hi)
458 return (EILSEQ);
459 hi = lo;
460 continue;
461 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
462 if (! hi)
463 return (EILSEQ);
464 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
465 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
466 + UCONV_U16_START;
467 hi = 0;
468 } else if (hi) {
469 return (EILSEQ);
470 }
471
472 /*
473 * Now we convert a UTF-32 character into a UTF-8 character.
474 * Unicode coding space is between U+0000 and U+10FFFF;
475 * anything bigger is an illegal character.
476 */
477 if (lo <= UCONV_U8_ONE_BYTE) {
478 if (u8l >= *utf8len)
479 return (E2BIG);
480 u8s[u8l++] = (uchar_t)lo;
481 } else if (lo <= UCONV_U8_TWO_BYTES) {
482 if ((u8l + 1) >= *utf8len)
483 return (E2BIG);
484 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
485 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
486 } else if (lo <= UCONV_U8_THREE_BYTES) {
487 if ((u8l + 2) >= *utf8len)
488 return (E2BIG);
489 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
490 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
491 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
492 } else if (lo <= UCONV_U8_FOUR_BYTES) {
493 if ((u8l + 3) >= *utf8len)
494 return (E2BIG);
495 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
496 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
497 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
498 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
499 } else {
500 return (EILSEQ);
501 }
502 }
503
504 if (hi)
505 return (EINVAL);
506
507 *utf16len = u16l;
508 *utf8len = u8l;
509
510 return (0);
511}
512
513int
514uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
515 uint16_t *u16s, size_t *utf16len, int flag)
516{
517 int inendian;
518 int outendian;
519 size_t u16l;
520 size_t u32l;
521 uint32_t hi;
522 uint32_t lo;
523 boolean_t do_not_ignore_null;
524
525 if (u32s == NULL || utf32len == NULL)
526 return (EILSEQ);
527
528 if (u16s == NULL || utf16len == NULL)
529 return (E2BIG);
530
531 if (check_endian(flag, &inendian, &outendian) != 0)
532 return (EBADF);
533
534 u16l = u32l = 0;
535 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
536
537 if ((flag & UCONV_IN_ACCEPT_BOM) &&
538 check_bom32(u32s, *utf32len, &inendian))
539 u32l++;
540
541 inendian &= UCONV_IN_NAT_ENDIAN;
542 outendian &= UCONV_OUT_NAT_ENDIAN;
543
544 if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
545 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
546 UCONV_BOM_SWAPPED;
547
548 for (; u32l < *utf32len; u32l++) {
549 if (u32s[u32l] == 0 && do_not_ignore_null)
550 break;
551
552 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
553
554 /*
555 * Anything bigger than the Unicode coding space, i.e.,
556 * Unicode scalar value bigger than U+10FFFF, is an illegal
557 * character.
558 */
559 if (hi > UCONV_UNICODE_MAX)
560 return (EILSEQ);
561
562 /*
563 * Anything bigger than U+FFFF must be converted into
564 * a surrogate pair in UTF-16.
565 */
566 if (hi >= UCONV_U16_START) {
567 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
568 UCONV_U16_LO_MIN;
569 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
570 UCONV_U16_HI_MIN;
571
572 if ((u16l + 1) >= *utf16len)
573 return (E2BIG);
574
575 if (outendian) {
576 u16s[u16l++] = (uint16_t)hi;
577 u16s[u16l++] = (uint16_t)lo;
578 } else {
579 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
580 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
581 }
582 } else {
583 if (u16l >= *utf16len)
584 return (E2BIG);
585 u16s[u16l++] = (outendian) ? (uint16_t)hi :
586 BSWAP_16(((uint16_t)hi));
587 }
588 }
589
590 *utf16len = u16l;
591 *utf32len = u32l;
592
593 return (0);
594}
595
596int
597uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
598 uchar_t *u8s, size_t *utf8len, int flag)
599{
600 int inendian;
601 int outendian;
602 size_t u32l;
603 size_t u8l;
604 uint32_t lo;
605 boolean_t do_not_ignore_null;
606
607 if (u32s == NULL || utf32len == NULL)
608 return (EILSEQ);
609
610 if (u8s == NULL || utf8len == NULL)
611 return (E2BIG);
612
613 if (check_endian(flag, &inendian, &outendian) != 0)
614 return (EBADF);
615
616 u32l = u8l = 0;
617 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
618
619 if ((flag & UCONV_IN_ACCEPT_BOM) &&
620 check_bom32(u32s, *utf32len, &inendian))
621 u32l++;
622
623 inendian &= UCONV_IN_NAT_ENDIAN;
624
625 for (; u32l < *utf32len; u32l++) {
626 if (u32s[u32l] == 0 && do_not_ignore_null)
627 break;
628
629 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
630
631 if (lo <= UCONV_U8_ONE_BYTE) {
632 if (u8l >= *utf8len)
633 return (E2BIG);
634 u8s[u8l++] = (uchar_t)lo;
635 } else if (lo <= UCONV_U8_TWO_BYTES) {
636 if ((u8l + 1) >= *utf8len)
637 return (E2BIG);
638 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
639 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
640 } else if (lo <= UCONV_U8_THREE_BYTES) {
641 if ((u8l + 2) >= *utf8len)
642 return (E2BIG);
643 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
644 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
645 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
646 } else if (lo <= UCONV_U8_FOUR_BYTES) {
647 if ((u8l + 3) >= *utf8len)
648 return (E2BIG);
649 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
650 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
651 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
652 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
653 } else {
654 return (EILSEQ);
655 }
656 }
657
658 *utf32len = u32l;
659 *utf8len = u8l;
660
661 return (0);
662}
663
664int
665uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
666 uint16_t *u16s, size_t *utf16len, int flag)
667{
668 int inendian;
669 int outendian;
670 size_t u16l;
671 size_t u8l;
672 uint32_t hi;
673 uint32_t lo;
674 int remaining_bytes;
675 int first_b;
676 boolean_t do_not_ignore_null;
677
678 if (u8s == NULL || utf8len == NULL)
679 return (EILSEQ);
680
681 if (u16s == NULL || utf16len == NULL)
682 return (E2BIG);
683
684 if (check_endian(flag, &inendian, &outendian) != 0)
685 return (EBADF);
686
687 u16l = u8l = 0;
688 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
689
690 outendian &= UCONV_OUT_NAT_ENDIAN;
691
692 if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
693 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
694 UCONV_BOM_SWAPPED;
695
696 for (; u8l < *utf8len; ) {
697 if (u8s[u8l] == 0 && do_not_ignore_null)
698 break;
699
700 /*
701 * Collect a UTF-8 character and convert it to a UTF-32
702 * character. In doing so, we screen out illegally formed
703 * UTF-8 characters and treat such as illegal characters.
704 * The algorithm at below also screens out anything bigger
705 * than the U+10FFFF.
706 *
707 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
708 * more details on the illegal values of UTF-8 character
709 * bytes.
710 */
711 hi = (uint32_t)u8s[u8l++];
712
713 if (hi > UCONV_ASCII_MAX) {
714 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
715 return (EILSEQ);
716
717 first_b = hi;
718 hi = hi & u8_masks_tbl[remaining_bytes];
719
720 for (; remaining_bytes > 0; remaining_bytes--) {
721 /*
722 * If we have no more bytes, the current
723 * UTF-8 character is incomplete.
724 */
725 if (u8l >= *utf8len)
726 return (EINVAL);
727
728 lo = (uint32_t)u8s[u8l++];
729
730 if (first_b) {
731 if (lo < valid_min_2nd_byte[first_b] ||
732 lo > valid_max_2nd_byte[first_b])
733 return (EILSEQ);
734 first_b = 0;
735 } else if (lo < UCONV_U8_BYTE_MIN ||
736 lo > UCONV_U8_BYTE_MAX) {
737 return (EILSEQ);
738 }
739 hi = (hi << UCONV_U8_BIT_SHIFT) |
740 (lo & UCONV_U8_BIT_MASK);
741 }
742 }
743
744 if (hi >= UCONV_U16_START) {
745 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
746 UCONV_U16_LO_MIN;
747 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
748 UCONV_U16_HI_MIN;
749
750 if ((u16l + 1) >= *utf16len)
751 return (E2BIG);
752
753 if (outendian) {
754 u16s[u16l++] = (uint16_t)hi;
755 u16s[u16l++] = (uint16_t)lo;
756 } else {
757 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
758 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
759 }
760 } else {
761 if (u16l >= *utf16len)
762 return (E2BIG);
763
764 u16s[u16l++] = (outendian) ? (uint16_t)hi :
765 BSWAP_16(((uint16_t)hi));
766 }
767 }
768
769 *utf16len = u16l;
770 *utf8len = u8l;
771
772 return (0);
773}
774
775int
776uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
777 uint32_t *u32s, size_t *utf32len, int flag)
778{
779 int inendian;
780 int outendian;
781 size_t u32l;
782 size_t u8l;
783 uint32_t hi;
784 uint32_t c;
785 int remaining_bytes;
786 int first_b;
787 boolean_t do_not_ignore_null;
788
789 if (u8s == NULL || utf8len == NULL)
790 return (EILSEQ);
791
792 if (u32s == NULL || utf32len == NULL)
793 return (E2BIG);
794
795 if (check_endian(flag, &inendian, &outendian) != 0)
796 return (EBADF);
797
798 u32l = u8l = 0;
799 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
800
801 outendian &= UCONV_OUT_NAT_ENDIAN;
802
803 if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
804 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
805 UCONV_BOM_SWAPPED_32;
806
807 for (; u8l < *utf8len; ) {
808 if (u8s[u8l] == 0 && do_not_ignore_null)
809 break;
810
811 hi = (uint32_t)u8s[u8l++];
812
813 if (hi > UCONV_ASCII_MAX) {
814 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
815 return (EILSEQ);
816
817 first_b = hi;
818 hi = hi & u8_masks_tbl[remaining_bytes];
819
820 for (; remaining_bytes > 0; remaining_bytes--) {
821 if (u8l >= *utf8len)
822 return (EINVAL);
823
824 c = (uint32_t)u8s[u8l++];
825
826 if (first_b) {
827 if (c < valid_min_2nd_byte[first_b] ||
828 c > valid_max_2nd_byte[first_b])
829 return (EILSEQ);
830 first_b = 0;
831 } else if (c < UCONV_U8_BYTE_MIN ||
832 c > UCONV_U8_BYTE_MAX) {
833 return (EILSEQ);
834 }
835 hi = (hi << UCONV_U8_BIT_SHIFT) |
836 (c & UCONV_U8_BIT_MASK);
837 }
838 }
839
840 if (u32l >= *utf32len)
841 return (E2BIG);
842
843 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
844 }
845
846 *utf32len = u32l;
847 *utf8len = u8l;
848
849 return (0);
850}
c28b2279 851
#ifdef _KERNEL
/* Export the six conversion entry points; kernel builds only. */
EXPORT_SYMBOL(uconv_u16tou32);
EXPORT_SYMBOL(uconv_u16tou8);
EXPORT_SYMBOL(uconv_u32tou16);
EXPORT_SYMBOL(uconv_u32tou8);
EXPORT_SYMBOL(uconv_u8tou16);
EXPORT_SYMBOL(uconv_u8tou32);
#endif	/* _KERNEL */