module/unicode/uconv.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26
  27
  28 /*
  29  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
  30  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
  31  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
  32  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
  33  * the section 3C man pages.
  34  * Interface stability: Committed
  35  */
  36
  37 #include <sys/types.h>
  38 #ifdef  _KERNEL
  39 #include <sys/param.h>
  40 #include <sys/sysmacros.h>
  41 #include <sys/debug.h>
  42 #include <sys/kmem.h>
  43 #include <sys/sunddi.h>
  44 #else
  45 #include <sys/u8_textprep.h>
  46 #endif  /* _KERNEL */
  47 #include <sys/byteorder.h>
  48 #include <sys/errno.h>
  49
  50
  51 /*
  52  * The max and min values of high and low surrogate pairs of UTF-16,
  53  * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
  54  */
  55 #define UCONV_U16_HI_MIN        (0xd800U)
  56 #define UCONV_U16_HI_MAX        (0xdbffU)
  57 #define UCONV_U16_LO_MIN        (0xdc00U)
  58 #define UCONV_U16_LO_MAX        (0xdfffU)
  59 #define UCONV_U16_BIT_SHIFT     (0x0400U)
  60 #define UCONV_U16_BIT_MASK      (0x0fffffU)
  61 #define UCONV_U16_START         (0x010000U)
  62
  63 /* The maximum value of Unicode coding space and ASCII coding space. */
  64 #define UCONV_UNICODE_MAX       (0x10ffffU)
  65 #define UCONV_ASCII_MAX         (0x7fU)
  66
  67 /* The mask values for input and output endians. */
  68 #define UCONV_IN_ENDIAN_MASKS   (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
  69 #define UCONV_OUT_ENDIAN_MASKS  (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
  70
  71 /* Native and reversed endian macros. */
  72 #ifdef  _BIG_ENDIAN
  73 #define UCONV_IN_NAT_ENDIAN     UCONV_IN_BIG_ENDIAN
  74 #define UCONV_IN_REV_ENDIAN     UCONV_IN_LITTLE_ENDIAN
  75 #define UCONV_OUT_NAT_ENDIAN    UCONV_OUT_BIG_ENDIAN
  76 #define UCONV_OUT_REV_ENDIAN    UCONV_OUT_LITTLE_ENDIAN
  77 #else
  78 #define UCONV_IN_NAT_ENDIAN     UCONV_IN_LITTLE_ENDIAN
  79 #define UCONV_IN_REV_ENDIAN     UCONV_IN_BIG_ENDIAN
  80 #define UCONV_OUT_NAT_ENDIAN    UCONV_OUT_LITTLE_ENDIAN
  81 #define UCONV_OUT_REV_ENDIAN    UCONV_OUT_BIG_ENDIAN
  82 #endif  /* _BIG_ENDIAN */
  83
  84 /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
  85 #define UCONV_BOM_NORMAL        (0xfeffU)
  86 #define UCONV_BOM_SWAPPED       (0xfffeU)
  87 #define UCONV_BOM_SWAPPED_32    (0xfffe0000U)
  88
  89 /* UTF-32 boundaries based on UTF-8 character byte lengths. */
  90 #define UCONV_U8_ONE_BYTE       (0x7fU)
  91 #define UCONV_U8_TWO_BYTES      (0x7ffU)
  92 #define UCONV_U8_THREE_BYTES    (0xffffU)
  93 #define UCONV_U8_FOUR_BYTES     (0x10ffffU)
  94
  95 /* The common minimum and maximum values at the UTF-8 character bytes. */
  96 #define UCONV_U8_BYTE_MIN       (0x80U)
  97 #define UCONV_U8_BYTE_MAX       (0xbfU)
  98
  99 /*
 100  * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 101  * UTF-8 character bytes.
 102  */
 103 #define UCONV_U8_BIT_SHIFT      6
 104 #define UCONV_U8_BIT_MASK       0x3f
 105
 106 /*
 107  * The following vector shows remaining bytes in a UTF-8 character.
 108  * Index will be the first byte of the character.
 109  */
 110 static const uchar_t remaining_bytes_tbl[0x100] = {
 111         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 112         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 113         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 114         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 115         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 116         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 117         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 118         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 119         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 120         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 121         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 122         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 123
 124 /*      C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
 125         0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 126
 127 /*      D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
 128         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 129
 130 /*      E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
 131         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
 132
 133 /*      F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
 134         3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
 135 };
 136
 137 /*
 138  * The following is a vector of bit-masks to get used bits in
 139  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
 140  * the character.
 141  */
 142 #ifdef  _KERNEL
 143 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 144 #else
 145 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 146 #endif  /* _KERNEL */
 147
 148 /*
 149  * The following two vectors are to provide valid minimum and
 150  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
 151  * better illegal sequence checking. The index value must be the value of
 152  * the first byte of the UTF-8 character.
 153  */
 154 static const uchar_t valid_min_2nd_byte[0x100] = {
 155         0,    0,    0,    0,    0,    0,    0,    0,
 156         0,    0,    0,    0,    0,    0,    0,    0,
 157         0,    0,    0,    0,    0,    0,    0,    0,
 158         0,    0,    0,    0,    0,    0,    0,    0,
 159         0,    0,    0,    0,    0,    0,    0,    0,
 160         0,    0,    0,    0,    0,    0,    0,    0,
 161         0,    0,    0,    0,    0,    0,    0,    0,
 162         0,    0,    0,    0,    0,    0,    0,    0,
 163         0,    0,    0,    0,    0,    0,    0,    0,
 164         0,    0,    0,    0,    0,    0,    0,    0,
 165         0,    0,    0,    0,    0,    0,    0,    0,
 166         0,    0,    0,    0,    0,    0,    0,    0,
 167         0,    0,    0,    0,    0,    0,    0,    0,
 168         0,    0,    0,    0,    0,    0,    0,    0,
 169         0,    0,    0,    0,    0,    0,    0,    0,
 170         0,    0,    0,    0,    0,    0,    0,    0,
 171         0,    0,    0,    0,    0,    0,    0,    0,
 172         0,    0,    0,    0,    0,    0,    0,    0,
 173         0,    0,    0,    0,    0,    0,    0,    0,
 174         0,    0,    0,    0,    0,    0,    0,    0,
 175         0,    0,    0,    0,    0,    0,    0,    0,
 176         0,    0,    0,    0,    0,    0,    0,    0,
 177         0,    0,    0,    0,    0,    0,    0,    0,
 178         0,    0,    0,    0,    0,    0,    0,    0,
 179
 180 /*      C0    C1    C2    C3    C4    C5    C6    C7 */
 181         0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 182
 183 /*      C8    C9    CA    CB    CC    CD    CE    CF */
 184         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 185
 186 /*      D0    D1    D2    D3    D4    D5    D6    D7 */
 187         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 188
 189 /*      D8    D9    DA    DB    DC    DD    DE    DF */
 190         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 191
 192 /*      E0    E1    E2    E3    E4    E5    E6    E7 */
 193         0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 194
 195 /*      E8    E9    EA    EB    EC    ED    EE    EF */
 196         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 197
 198 /*      F0    F1    F2    F3    F4    F5    F6    F7 */
 199         0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
 200
 201         0,    0,    0,    0,    0,    0,    0,    0
 202 };
 203
 204 static const uchar_t valid_max_2nd_byte[0x100] = {
 205         0,    0,    0,    0,    0,    0,    0,    0,
 206         0,    0,    0,    0,    0,    0,    0,    0,
 207         0,    0,    0,    0,    0,    0,    0,    0,
 208         0,    0,    0,    0,    0,    0,    0,    0,
 209         0,    0,    0,    0,    0,    0,    0,    0,
 210         0,    0,    0,    0,    0,    0,    0,    0,
 211         0,    0,    0,    0,    0,    0,    0,    0,
 212         0,    0,    0,    0,    0,    0,    0,    0,
 213         0,    0,    0,    0,    0,    0,    0,    0,
 214         0,    0,    0,    0,    0,    0,    0,    0,
 215         0,    0,    0,    0,    0,    0,    0,    0,
 216         0,    0,    0,    0,    0,    0,    0,    0,
 217         0,    0,    0,    0,    0,    0,    0,    0,
 218         0,    0,    0,    0,    0,    0,    0,    0,
 219         0,    0,    0,    0,    0,    0,    0,    0,
 220         0,    0,    0,    0,    0,    0,    0,    0,
 221         0,    0,    0,    0,    0,    0,    0,    0,
 222         0,    0,    0,    0,    0,    0,    0,    0,
 223         0,    0,    0,    0,    0,    0,    0,    0,
 224         0,    0,    0,    0,    0,    0,    0,    0,
 225         0,    0,    0,    0,    0,    0,    0,    0,
 226         0,    0,    0,    0,    0,    0,    0,    0,
 227         0,    0,    0,    0,    0,    0,    0,    0,
 228         0,    0,    0,    0,    0,    0,    0,    0,
 229
 230 /*      C0    C1    C2    C3    C4    C5    C6    C7 */
 231         0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 232
 233 /*      C8    C9    CA    CB    CC    CD    CE    CF */
 234         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 235
 236 /*      D0    D1    D2    D3    D4    D5    D6    D7 */
 237         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 238
 239 /*      D8    D9    DA    DB    DC    DD    DE    DF */
 240         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 241
 242 /*      E0    E1    E2    E3    E4    E5    E6    E7 */
 243         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
 244
 245 /*      E8    E9    EA    EB    EC    ED    EE    EF */
 246         0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
 247
 248 /*      F0    F1    F2    F3    F4    F5    F6    F7 */
 249         0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
 250
 251         0,    0,    0,    0,    0,    0,    0,    0
 252 };
 253
 254
 255 static int
 256 check_endian(int flag, int *in, int *out)
 257 {
 258         *in = flag & UCONV_IN_ENDIAN_MASKS;
 259
 260         /* You cannot have both. */
 261         if (*in == UCONV_IN_ENDIAN_MASKS)
 262                 return (EBADF);
 263
 264         if (*in == 0)
 265                 *in = UCONV_IN_NAT_ENDIAN;
 266
 267         *out = flag & UCONV_OUT_ENDIAN_MASKS;
 268
 269         /* You cannot have both. */
 270         if (*out == UCONV_OUT_ENDIAN_MASKS)
 271                 return (EBADF);
 272
 273         if (*out == 0)
 274                 *out = UCONV_OUT_NAT_ENDIAN;
 275
 276         return (0);
 277 }
 278
 279 static boolean_t
 280 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
 281 {
 282         if (u16l > 0) {
 283                 if (*u16s == UCONV_BOM_NORMAL) {
 284                         *in = UCONV_IN_NAT_ENDIAN;
 285                         return (B_TRUE);
 286                 }
 287                 if (*u16s == UCONV_BOM_SWAPPED) {
 288                         *in = UCONV_IN_REV_ENDIAN;
 289                         return (B_TRUE);
 290                 }
 291         }
 292
 293         return (B_FALSE);
 294 }
 295
 296 static boolean_t
 297 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
 298 {
 299         if (u32l > 0) {
 300                 if (*u32s == UCONV_BOM_NORMAL) {
 301                         *in = UCONV_IN_NAT_ENDIAN;
 302                         return (B_TRUE);
 303                 }
 304                 if (*u32s == UCONV_BOM_SWAPPED_32) {
 305                         *in = UCONV_IN_REV_ENDIAN;
 306                         return (B_TRUE);
 307                 }
 308         }
 309
 310         return (B_FALSE);
 311 }
 312
 313 int
 314 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
 315     uint32_t *u32s, size_t *utf32len, int flag)
 316 {
 317         int inendian;
 318         int outendian;
 319         size_t u16l;
 320         size_t u32l;
 321         uint32_t hi;
 322         uint32_t lo;
 323         boolean_t do_not_ignore_null;
 324
 325         /*
 326          * Do preliminary validity checks on parameters and collect info on
 327          * endians.
 328          */
 329         if (u16s == NULL || utf16len == NULL)
 330                 return (EILSEQ);
 331
 332         if (u32s == NULL || utf32len == NULL)
 333                 return (E2BIG);
 334
 335         if (check_endian(flag, &inendian, &outendian) != 0)
 336                 return (EBADF);
 337
 338         /*
 339          * Initialize input and output parameter buffer indices and
 340          * temporary variables.
 341          */
 342         u16l = u32l = 0;
 343         hi = 0;
 344         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 345
 346         /*
 347          * Check on the BOM at the beginning of the input buffer if required
 348          * and if there is indeed one, process it.
 349          */
 350         if ((flag & UCONV_IN_ACCEPT_BOM) &&
 351             check_bom16(u16s, *utf16len, &inendian))
 352                 u16l++;
 353
 354         /*
 355          * Reset inendian and outendian so that after this point, those can be
 356          * used as condition values.
 357          */
 358         inendian &= UCONV_IN_NAT_ENDIAN;
 359         outendian &= UCONV_OUT_NAT_ENDIAN;
 360
 361         /*
 362          * If there is something in the input buffer and if necessary and
 363          * requested, save the BOM at the output buffer.
 364          */
 365         if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
 366                 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
 367                     UCONV_BOM_SWAPPED_32;
 368
 369         /*
 370          * Do conversion; if encounter a surrogate pair, assemble high and
 371          * low pair values to form a UTF-32 character. If a half of a pair
 372          * exists alone, then, either it is an illegal (EILSEQ) or
 373          * invalid (EINVAL) value.
 374          */
 375         for (; u16l < *utf16len; u16l++) {
 376                 if (u16s[u16l] == 0 && do_not_ignore_null)
 377                         break;
 378
 379                 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
 380
 381                 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
 382                         if (hi)
 383                                 return (EILSEQ);
 384                         hi = lo;
 385                         continue;
 386                 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
 387                         if (! hi)
 388                                 return (EILSEQ);
 389                         lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
 390                             lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
 391                             + UCONV_U16_START;
 392                         hi = 0;
 393                 } else if (hi) {
 394                         return (EILSEQ);
 395                 }
 396
 397                 if (u32l >= *utf32len)
 398                         return (E2BIG);
 399
 400                 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
 401         }
 402
 403         /*
 404          * If high half didn't see low half, then, it's most likely the input
 405          * parameter is incomplete.
 406          */
 407         if (hi)
 408                 return (EINVAL);
 409
 410         /*
 411          * Save the number of consumed and saved characters. They do not
 412          * include terminating NULL character (U+0000) at the end of
 413          * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
 414          * the input buffer length is big enough to include the terminating
 415          * NULL character).
 416          */
 417         *utf16len = u16l;
 418         *utf32len = u32l;
 419
 420         return (0);
 421 }
 422
 423 int
 424 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
 425     uchar_t *u8s, size_t *utf8len, int flag)
 426 {
 427         int inendian;
 428         int outendian;
 429         size_t u16l;
 430         size_t u8l;
 431         uint32_t hi;
 432         uint32_t lo;
 433         boolean_t do_not_ignore_null;
 434
 435         if (u16s == NULL || utf16len == NULL)
 436                 return (EILSEQ);
 437
 438         if (u8s == NULL || utf8len == NULL)
 439                 return (E2BIG);
 440
 441         if (check_endian(flag, &inendian, &outendian) != 0)
 442                 return (EBADF);
 443
 444         u16l = u8l = 0;
 445         hi = 0;
 446         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 447
 448         if ((flag & UCONV_IN_ACCEPT_BOM) &&
 449             check_bom16(u16s, *utf16len, &inendian))
 450                 u16l++;
 451
 452         inendian &= UCONV_IN_NAT_ENDIAN;
 453
 454         for (; u16l < *utf16len; u16l++) {
 455                 if (u16s[u16l] == 0 && do_not_ignore_null)
 456                         break;
 457
 458                 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
 459
 460                 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
 461                         if (hi)
 462                                 return (EILSEQ);
 463                         hi = lo;
 464                         continue;
 465                 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
 466                         if (! hi)
 467                                 return (EILSEQ);
 468                         lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
 469                             lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
 470                             + UCONV_U16_START;
 471                         hi = 0;
 472                 } else if (hi) {
 473                         return (EILSEQ);
 474                 }
 475
 476                 /*
 477                  * Now we convert a UTF-32 character into a UTF-8 character.
 478                  * Unicode coding space is between U+0000 and U+10FFFF;
 479                  * anything bigger is an illegal character.
 480                  */
 481                 if (lo <= UCONV_U8_ONE_BYTE) {
 482                         if (u8l >= *utf8len)
 483                                 return (E2BIG);
 484                         u8s[u8l++] = (uchar_t)lo;
 485                 } else if (lo <= UCONV_U8_TWO_BYTES) {
 486                         if ((u8l + 1) >= *utf8len)
 487                                 return (E2BIG);
 488                         u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
 489                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
 490                 } else if (lo <= UCONV_U8_THREE_BYTES) {
 491                         if ((u8l + 2) >= *utf8len)
 492                                 return (E2BIG);
 493                         u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
 494                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
 495                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
 496                 } else if (lo <= UCONV_U8_FOUR_BYTES) {
 497                         if ((u8l + 3) >= *utf8len)
 498                                 return (E2BIG);
 499                         u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
 500                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
 501                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
 502                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
 503                 } else {
 504                         return (EILSEQ);
 505                 }
 506         }
 507
 508         if (hi)
 509                 return (EINVAL);
 510
 511         *utf16len = u16l;
 512         *utf8len = u8l;
 513
 514         return (0);
 515 }
 516
 517 int
 518 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
 519     uint16_t *u16s, size_t *utf16len, int flag)
 520 {
 521         int inendian;
 522         int outendian;
 523         size_t u16l;
 524         size_t u32l;
 525         uint32_t hi;
 526         uint32_t lo;
 527         boolean_t do_not_ignore_null;
 528
 529         if (u32s == NULL || utf32len == NULL)
 530                 return (EILSEQ);
 531
 532         if (u16s == NULL || utf16len == NULL)
 533                 return (E2BIG);
 534
 535         if (check_endian(flag, &inendian, &outendian) != 0)
 536                 return (EBADF);
 537
 538         u16l = u32l = 0;
 539         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 540
 541         if ((flag & UCONV_IN_ACCEPT_BOM) &&
 542             check_bom32(u32s, *utf32len, &inendian))
 543                 u32l++;
 544
 545         inendian &= UCONV_IN_NAT_ENDIAN;
 546         outendian &= UCONV_OUT_NAT_ENDIAN;
 547
 548         if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
 549                 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
 550                     UCONV_BOM_SWAPPED;
 551
 552         for (; u32l < *utf32len; u32l++) {
 553                 if (u32s[u32l] == 0 && do_not_ignore_null)
 554                         break;
 555
 556                 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
 557
 558                 /*
 559                  * Anything bigger than the Unicode coding space, i.e.,
 560                  * Unicode scalar value bigger than U+10FFFF, is an illegal
 561                  * character.
 562                  */
 563                 if (hi > UCONV_UNICODE_MAX)
 564                         return (EILSEQ);
 565
 566                 /*
 567                  * Anything bigger than U+FFFF must be converted into
 568                  * a surrogate pair in UTF-16.
 569                  */
 570                 if (hi >= UCONV_U16_START) {
 571                         lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
 572                             UCONV_U16_LO_MIN;
 573                         hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
 574                             UCONV_U16_HI_MIN;
 575
 576                         if ((u16l + 1) >= *utf16len)
 577                                 return (E2BIG);
 578
 579                         if (outendian) {
 580                                 u16s[u16l++] = (uint16_t)hi;
 581                                 u16s[u16l++] = (uint16_t)lo;
 582                         } else {
 583                                 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
 584                                 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
 585                         }
 586                 } else {
 587                         if (u16l >= *utf16len)
 588                                 return (E2BIG);
 589                         u16s[u16l++] = (outendian) ? (uint16_t)hi :
 590                             BSWAP_16(((uint16_t)hi));
 591                 }
 592         }
 593
 594         *utf16len = u16l;
 595         *utf32len = u32l;
 596
 597         return (0);
 598 }
 599
 600 int
 601 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
 602     uchar_t *u8s, size_t *utf8len, int flag)
 603 {
 604         int inendian;
 605         int outendian;
 606         size_t u32l;
 607         size_t u8l;
 608         uint32_t lo;
 609         boolean_t do_not_ignore_null;
 610
 611         if (u32s == NULL || utf32len == NULL)
 612                 return (EILSEQ);
 613
 614         if (u8s == NULL || utf8len == NULL)
 615                 return (E2BIG);
 616
 617         if (check_endian(flag, &inendian, &outendian) != 0)
 618                 return (EBADF);
 619
 620         u32l = u8l = 0;
 621         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 622
 623         if ((flag & UCONV_IN_ACCEPT_BOM) &&
 624             check_bom32(u32s, *utf32len, &inendian))
 625                 u32l++;
 626
 627         inendian &= UCONV_IN_NAT_ENDIAN;
 628
 629         for (; u32l < *utf32len; u32l++) {
 630                 if (u32s[u32l] == 0 && do_not_ignore_null)
 631                         break;
 632
 633                 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
 634
 635                 if (lo <= UCONV_U8_ONE_BYTE) {
 636                         if (u8l >= *utf8len)
 637                                 return (E2BIG);
 638                         u8s[u8l++] = (uchar_t)lo;
 639                 } else if (lo <= UCONV_U8_TWO_BYTES) {
 640                         if ((u8l + 1) >= *utf8len)
 641                                 return (E2BIG);
 642                         u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
 643                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
 644                 } else if (lo <= UCONV_U8_THREE_BYTES) {
 645                         if ((u8l + 2) >= *utf8len)
 646                                 return (E2BIG);
 647                         u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
 648                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
 649                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
 650                 } else if (lo <= UCONV_U8_FOUR_BYTES) {
 651                         if ((u8l + 3) >= *utf8len)
 652                                 return (E2BIG);
 653                         u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
 654                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
 655                         u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
 656                         u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
 657                 } else {
 658                         return (EILSEQ);
 659                 }
 660         }
 661
 662         *utf32len = u32l;
 663         *utf8len = u8l;
 664
 665         return (0);
 666 }
 667
 668 int
 669 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
 670     uint16_t *u16s, size_t *utf16len, int flag)
 671 {
 672         int inendian;
 673         int outendian;
 674         size_t u16l;
 675         size_t u8l;
 676         uint32_t hi;
 677         uint32_t lo;
 678         int remaining_bytes;
 679         int first_b;
 680         boolean_t do_not_ignore_null;
 681
 682         if (u8s == NULL || utf8len == NULL)
 683                 return (EILSEQ);
 684
 685         if (u16s == NULL || utf16len == NULL)
 686                 return (E2BIG);
 687
 688         if (check_endian(flag, &inendian, &outendian) != 0)
 689                 return (EBADF);
 690
 691         u16l = u8l = 0;
 692         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 693
 694         outendian &= UCONV_OUT_NAT_ENDIAN;
 695
 696         if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
 697                 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
 698                     UCONV_BOM_SWAPPED;
 699
 700         for (; u8l < *utf8len; ) {
 701                 if (u8s[u8l] == 0 && do_not_ignore_null)
 702                         break;
 703
 704                 /*
 705                  * Collect a UTF-8 character and convert it to a UTF-32
 706                  * character. In doing so, we screen out illegally formed
 707                  * UTF-8 characters and treat such as illegal characters.
 708                  * The algorithm at below also screens out anything bigger
 709                  * than the U+10FFFF.
 710                  *
 711                  * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
 712                  * more details on the illegal values of UTF-8 character
 713                  * bytes.
 714                  */
 715                 hi = (uint32_t)u8s[u8l++];
 716
 717                 if (hi > UCONV_ASCII_MAX) {
 718                         if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
 719                                 return (EILSEQ);
 720
 721                         first_b = hi;
 722                         hi = hi & u8_masks_tbl[remaining_bytes];
 723
 724                         for (; remaining_bytes > 0; remaining_bytes--) {
 725                                 /*
 726                                  * If we have no more bytes, the current
 727                                  * UTF-8 character is incomplete.
 728                                  */
 729                                 if (u8l >= *utf8len)
 730                                         return (EINVAL);
 731
 732                                 lo = (uint32_t)u8s[u8l++];
 733
 734                                 if (first_b) {
 735                                         if (lo < valid_min_2nd_byte[first_b] ||
 736                                             lo > valid_max_2nd_byte[first_b])
 737                                                 return (EILSEQ);
 738                                         first_b = 0;
 739                                 } else if (lo < UCONV_U8_BYTE_MIN ||
 740                                     lo > UCONV_U8_BYTE_MAX) {
 741                                         return (EILSEQ);
 742                                 }
 743                                 hi = (hi << UCONV_U8_BIT_SHIFT) |
 744                                     (lo & UCONV_U8_BIT_MASK);
 745                         }
 746                 }
 747
 748                 if (hi >= UCONV_U16_START) {
 749                         lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
 750                             UCONV_U16_LO_MIN;
 751                         hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
 752                             UCONV_U16_HI_MIN;
 753
 754                         if ((u16l + 1) >= *utf16len)
 755                                 return (E2BIG);
 756
 757                         if (outendian) {
 758                                 u16s[u16l++] = (uint16_t)hi;
 759                                 u16s[u16l++] = (uint16_t)lo;
 760                         } else {
 761                                 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
 762                                 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
 763                         }
 764                 } else {
 765                         if (u16l >= *utf16len)
 766                                 return (E2BIG);
 767
 768                         u16s[u16l++] = (outendian) ? (uint16_t)hi :
 769                             BSWAP_16(((uint16_t)hi));
 770                 }
 771         }
 772
 773         *utf16len = u16l;
 774         *utf8len = u8l;
 775
 776         return (0);
 777 }
 778
 779 int
 780 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
 781     uint32_t *u32s, size_t *utf32len, int flag)
 782 {
 783         int inendian;
 784         int outendian;
 785         size_t u32l;
 786         size_t u8l;
 787         uint32_t hi;
 788         uint32_t c;
 789         int remaining_bytes;
 790         int first_b;
 791         boolean_t do_not_ignore_null;
 792
 793         if (u8s == NULL || utf8len == NULL)
 794                 return (EILSEQ);
 795
 796         if (u32s == NULL || utf32len == NULL)
 797                 return (E2BIG);
 798
 799         if (check_endian(flag, &inendian, &outendian) != 0)
 800                 return (EBADF);
 801
 802         u32l = u8l = 0;
 803         do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
 804
 805         outendian &= UCONV_OUT_NAT_ENDIAN;
 806
 807         if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
 808                 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
 809                     UCONV_BOM_SWAPPED_32;
 810
 811         for (; u8l < *utf8len; ) {
 812                 if (u8s[u8l] == 0 && do_not_ignore_null)
 813                         break;
 814
 815                 hi = (uint32_t)u8s[u8l++];
 816
 817                 if (hi > UCONV_ASCII_MAX) {
 818                         if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
 819                                 return (EILSEQ);
 820
 821                         first_b = hi;
 822                         hi = hi & u8_masks_tbl[remaining_bytes];
 823
 824                         for (; remaining_bytes > 0; remaining_bytes--) {
 825                                 if (u8l >= *utf8len)
 826                                         return (EINVAL);
 827
 828                                 c = (uint32_t)u8s[u8l++];
 829
 830                                 if (first_b) {
 831                                         if (c < valid_min_2nd_byte[first_b] ||
 832                                             c > valid_max_2nd_byte[first_b])
 833                                                 return (EILSEQ);
 834                                         first_b = 0;
 835                                 } else if (c < UCONV_U8_BYTE_MIN ||
 836                                     c > UCONV_U8_BYTE_MAX) {
 837                                         return (EILSEQ);
 838                                 }
 839                                 hi = (hi << UCONV_U8_BIT_SHIFT) |
 840                                     (c & UCONV_U8_BIT_MASK);
 841                         }
 842                 }
 843
 844                 if (u32l >= *utf32len)
 845                         return (E2BIG);
 846
 847                 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
 848         }
 849
 850         *utf32len = u32l;
 851         *utf8len = u8l;
 852
 853         return (0);
 854 }
 855
 856 #if defined(_KERNEL)
 857 EXPORT_SYMBOL(uconv_u16tou32);
 858 EXPORT_SYMBOL(uconv_u16tou8);
 859 EXPORT_SYMBOL(uconv_u32tou16);
 860 EXPORT_SYMBOL(uconv_u32tou8);
 861 EXPORT_SYMBOL(uconv_u8tou16);
 862 EXPORT_SYMBOL(uconv_u8tou32);
 863 #endif