]> git.proxmox.com Git - mirror_edk2.git/blob - StdLib/LibC/Locale/multibyte_Utf8.c
3f29f2942af45debdfa33d821e732b4fc7e8264d
[mirror_edk2.git] / StdLib / LibC / Locale / multibyte_Utf8.c
1 /** @file
2 Copyright (c) 2012, Intel Corporation. All rights reserved.<BR>
3 This program and the accompanying materials
4 are licensed and made available under the terms and conditions of the BSD License
5 which accompanies this distribution. The full text of the license may be found at
6 http://opensource.org/licenses/bsd-license.php
7
8 THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
9 WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
10 **/
11 #include <assert.h>
12 #include <string.h>
13 #include <errno.h>
14 #include <stdlib.h>
15 #include <wchar.h>
16 #include <sys/types.h>
17
/* Integer type used to hold a single Unicode code point (UCS-4 value). */
typedef int ch_UCS4;

/* Conversion state that DecodeOneStateful() falls back to when it is handed a NULL state pointer. */
static mbstate_t LocalConvState = {0};
21
22 /** Map a UTF-8 encoded prefix byte to a sequence length.
23 Zero means illegal prefix, but valid surrogate if < 0xC0.
24 One indicates an ASCII-7 equivalent character.
25 Two, three, and four are the first byte for 2, 3, and 4 byte sequences, respectively.
26 See RFC 3629 for details.
27
28 TABLE ENCODING:
29 Low Nibble decodes the first byte into the number of bytes in the sequence.
30 A value of zero indicates an invalid byte.
31 The High Nibble encodes a bit mask to be used to match against the high nibble of the second byte.
32
33 example:
34 SequenceLength = code[c0] & 0x0F;
35 Mask = 0x80 | code[c0];
36
37 Surrogate bytes are valid if: code[cX] & Mask > 0x80;
38
39 */
40 static
41 UINT8 utf8_code_length[256] = {
42 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 00-0F */
43 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
44 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
45 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
46 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
47 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
48 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
49 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 70-7F */
50 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, /* 80-8F */
51 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, /* 90-9F */
52 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, /* A0-AF */
53 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, /* B0-BF */
54 0x00, 0x00, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, /* C0-C1 + C2-CF */
55 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, /* D0-DF */
56 0x43, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x33, 0x73, 0x73, /* E0-EF */
57 0x64, 0x74, 0x74, 0x74, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* F0-F4 + F5-FF */
58 };
59
60 /** Process one byte of a multibyte character.
61
62 @param ch
63 @param ps
64
65 @retval -2
66 @retval -1
67 @retval 1:4
68 **/
69 static
70 int
71 ProcessOneByte(unsigned char ch, mbstate_t *ps)
72 {
73 UINT32 Mask;
74 UINT32 Length;
75 int RetVal = 0;
76
77 if(ps->A > 3) {
78 // We are in an invalid state
79 ps->A = 0; // Initial State
80 }
81 ps->C[ps->A] = ch; // Save the current character
82 Mask = utf8_code_length[ch];
83
84 if(ps->A == 0) { // Initial State. First byte of sequence.
85 ps->E = Mask | 0x80;
86 Length = Mask & 0xF;
87 switch(Length) {
88 case 0: // State 0, Code 0
89 errno = EILSEQ;
90 RetVal = -1;
91 ps->E = 1; // Consume this character
92 break;
93 case 1: // State 0, Code 1
94 // ASCII-7 Character
95 ps->B = ps->D[0] = ch;
96 RetVal = 1;
97 break;
98 default: // State 0, Code 2, 3, 4
99 ps->A = 1; // Next state is State-1
100 RetVal = -2; // Incomplete but potentially valid character
101 break;
102 }
103 }
104 else {
105 // We are in state 1, 2, or 3 and processing a surrogate byte
106 Length = ps->E & 0xF;
107 if((Mask & ps->E) > 0x80) {
108 // This byte is valid
109 switch(ps->A) { // Process based upon our current state
110 case 1: // Second byte of the sequence.
111 if(Length == 2) { // State 1, Code 2
112 Length = ((ps->C[0] & 0x1f) << 6) + (ps->C[1] & 0x3f);
113 assert ((Length > 0x007F) && (Length <= 0x07FF));
114 ps->B = ps->D[0] = (UINT16)Length;
115 ps->A = 0; // Next state is State-0
116 RetVal = 2;
117 }
118 else { // This isn't the last character, get more. State 1, Code 3 or 4
119 ps->A = 2;
120 RetVal = -2;
121 }
122 break;
123 case 2: // Third byte of the sequence
124 if(Length == 3) {
125 Length = ((ps->C[0] & 0x0f) << 12) + ((ps->C[1] & 0x3f) << 6) + (ps->C[2] & 0x3f);
126 assert ((Length > 0x07FF) && (Length <= 0xFFFF));
127 ps->B = ps->D[0] = (UINT16)Length;
128 ps->A = 0; // Next state is State-0
129 RetVal = 3;
130 }
131 else {
132 ps->A = 3;
133 RetVal = -2;
134 }
135 break;
136 case 3: // Fourth byte of the sequence
137 if(Length == 4) {
138 Length = ((ps->C[0] & 0x7) << 18) + ((ps->C[1] & 0x3f) << 12) +
139 ((ps->C[2] & 0x3f) << 6) + (ps->C[3] & 0x3f);
140 ps->B = Length;
141 assert ((Length > 0xFFFF) && (Length <= 0x10ffff));
142
143 /* compute and append the two surrogates: */
144
145 /* translate from 10000..10FFFF to 0..FFFF */
146 Length -= 0x10000;
147
148 /* high surrogate = top 10 bits added to D800 */
149 ps->D[0] = (UINT16)(0xD800 + (Length >> 10));
150
151 /* low surrogate = bottom 10 bits added to DC00 */
152 ps->D[1] = (UINT16)(0xDC00 + (Length & 0x03FF));
153 ps->A = 0; // Next state is State-0
154 RetVal = 4;
155 }
156 else {
157 errno = EILSEQ;
158 ps->A = 0;
159 RetVal = -1;
160 ps->E = 4; // Can't happen, but consume this character anyway
161 }
162 break;
163 }
164 }
165 else { // Invalid surrogate character
166 errno = EILSEQ;
167 ps->A = 0; // Next is State-0
168 RetVal = -1;
169 ps->E = 0; // Don't Consume, it may be an initial byte
170 }
171 }
172 return RetVal;
173 }
174
175 /** Convert one Multibyte sequence.
176
177 @param Dest
178 @param Src
179 @param Len
180 @param pS
181
182 @retval -2 Bytes processed comprise an incomplete, but potentially valid, character.
183 @retval -1 An encoding error was encountered. ps->E indicates the number of bytes consumed.
184 @retval 0 Either Src is NULL or it points to a NUL character.
185 @retval 1:N N bytes were consumed producing a valid wide character.
186 **/
187 int
188 DecodeOneStateful(
189 wchar_t *Dest, // Pointer to output location, or NULL
190 const char *Src, // Multibyte Source (UTF8)
191 ssize_t Len, // Max Number of bytes to convert
192 mbstate_t *pS // Pointer to State struct., or NULL
193 )
194 {
195 const char *SrcEnd;
196 int NumConv;
197 unsigned char ch;
198
199 if((Src == NULL) || (*Src == '\0')) {
200 return 0;
201 }
202 if(pS == NULL) {
203 pS = &LocalConvState;
204 }
205 SrcEnd = Src + Len;
206 NumConv = 0;
207 while(Src < SrcEnd) {
208 ch = (unsigned char)*Src++;
209 NumConv = ProcessOneByte(ch, pS);
210 if(NumConv != -2)
211 break;
212 }
213 if((NumConv > 0) && (Dest != NULL)) {
214 Dest[0] = pS->D[0];
215 if(NumConv == 4) {
216 Dest[1] = pS->D[1];
217 }
218 }
219 return NumConv;
220 }
221
/** Convert wide characters (UTF-16) into multibyte characters (UTF-8).

    A high surrogate immediately followed by a low surrogate is combined and
    written as one four byte sequence.  An isolated surrogate is written as a
    three byte sequence, as before.

    @param[out] Dest  Buffer receiving the UTF-8 bytes followed by a terminating
                      NUL, or NULL to only compute the length.  When not NULL it
                      must have room for the returned count plus one byte.
    @param[in]  s     Pointer to the wide characters to convert.  Must not be NULL.
    @param[in]  size  Number of wide characters in s to convert.  Must be >= 0.

    @return   The number of bytes the UTF-8 form occupies, not counting the
              terminating NUL.  (ssize_t)-1 is returned, with errno set to
              EINVAL, if size is too large to be multiplied by MB_LEN_MAX.
**/
ssize_t
EncodeUtf8(char *Dest, wchar_t *s, ssize_t size)
{
  char       *p;                    /* next free byte in build buffer */
  char       *v;                    /* next free byte in destination */
  ssize_t     nneeded;              /* number of result bytes needed */
  ssize_t     i;                    /* index into s of next input character */
  int         NumInBuff;            // number of bytes in Buff
  char        Buff[4];              // Buffer into which each character is built

  assert(s != NULL);
  assert(size >= 0);

  v = Dest;
  nneeded = 0;
  if((size * MB_LEN_MAX) / MB_LEN_MAX != size) {
    // size is too large and resulted in overflow when multiplied by MB_LEN_MAX
    errno = EINVAL;
    return (ssize_t)-1;
  }

  for (i = 0; i < size;) {
    int   ch = s[i++];              /* code point being encoded */

    p = Buff;

    if (ch < 0x80) {
      /* Encode ASCII -- One Byte */
      *p++ = (char) ch;
    }
    else if (ch < 0x0800) {
      /* Encode Latin-1 -- Two Byte */
      *p++ = (char)(0xc0 | (ch >> 6));
      *p++ = (char)(0x80 | (ch & 0x3f));
    }
    else {
      /* Special case: a high surrogate followed by a low surrogate forms
         one UCS4 value.  An isolated surrogate is left as it is. */
      if ((0xD800 <= ch) && (ch <= 0xDBFF) && (i < size)) {
        int   ch2 = s[i];

        if ((0xDC00 <= ch2) && (ch2 <= 0xDFFF)) {
          ch = (((ch - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000;
          i++;
        }
      }
      if (ch >= 0x10000) {
        /* Encode UCS4 Unicode ordinals -- Four Byte */
        *p++ = (char)(0xf0 | (ch >> 18));
        *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
        *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
        *p++ = (char)(0x80 | (ch & 0x3f));
      }
      else {
        /* Encode UCS2 Unicode ordinals -- Three Byte */
        *p++ = (char)(0xe0 | (ch >> 12));
        *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
        *p++ = (char)(0x80 | (ch & 0x3f));
      }
    }
    /*  Every path falls through to here (there must be no early "continue"),
        so that the bytes built in Buff are always copied and counted.
        NumInBuff is the value 1, 2, 3, or 4.
    */
    NumInBuff = (int)(p - Buff);      // Number of bytes in Buff
    if(Dest != NULL) {        // Save character if Dest is not NULL
      memcpy(v, Buff, NumInBuff);
      v += NumInBuff;
    }
    nneeded += NumInBuff;     // Keep track of the number of bytes put into Dest
  }
  if(Dest != NULL) {
    // Terminate the destination string.
    *v = '\0';
  }
  return nneeded;     // Tell the caller
}
304
305 // ######################## Narrow to Wide Conversions #######################
306
307 /** If ps is not a null pointer, the mbsinit function determines whether the
308 pointed-to mbstate_t object describes an initial conversion state.
309
310 @return The mbsinit function returns nonzero if ps is a null pointer
311 or if the pointed-to object describes an initial conversion
312 state; otherwise, it returns zero.
313
314 Declared in: wchar.h
315 **/
316 int
317 mbsinit(const mbstate_t *ps)
318 {
319 if((ps == NULL) || (ps->A == 0)) {
320 return 1;
321 }
322 return 0;
323 }
324
325 /** The mbrlen function is equivalent to the call:<BR>
326 @verbatim
327 mbrtowc(NULL, s, n, ps != NULL ? ps : &internal)
328 @endverbatim
329 where internal is the mbstate_t object for the mbrlen function, except that
330 the expression designated by ps is evaluated only once.
331
332 @return The mbrlen function returns a value between zero and n,
333 inclusive, (size_t)(-2), or (size_t)(-1).
334
335 Declared in: wchar.h
336 **/
size_t
mbrlen(
  const char *s,
  size_t      n,
  mbstate_t  *ps
  )
{
  size_t  Consumed;

  // Same as mbrtowc(), except that the converted character is not stored.
  Consumed = mbrtowc((wchar_t *)NULL, s, n, ps);
  return Consumed;
}
346
347 /** Determine the number of bytes comprising a multibyte character.
348
349 If S is not a null pointer, the mblen function determines the number of bytes
350 contained in the multibyte character pointed to by S. Except that the
351 conversion state of the mbtowc function is not affected, it is equivalent to
352 mbtowc((wchar_t *)0, S, N);
353
354 @param[in] S NULL to query whether multibyte characters have
355 state-dependent encodings. Otherwise, points to a
356 multibyte character.
357 @param[in] N The maximum number of bytes in a multibyte character.
358
359 @return If S is a null pointer, the mblen function returns a nonzero or
360 zero value, if multibyte character encodings, respectively, do
361 or do not have state-dependent encodings. If S is not a null
362 pointer, the mblen function either returns 0 (if S points to the
363 null character), or returns the number of bytes that are contained
364 in the multibyte character (if the next N or fewer bytes form a
365 valid multibyte character), or returns -1 (if they do not form a
366 valid multibyte character).
367
368 Declared in: stdlib.h
369 **/
int
mblen(
  const char *s,
  size_t      n
  )
{
  mbstate_t   State;
  size_t      Result;

  if(s == NULL) {
    return 0;     // UTF-8 has no state-dependent encodings
  }

  // Work on a private state that starts out in the initial state, so that an
  // incomplete sequence seen here cannot disturb any other conversion function.
  memset(&State, 0, sizeof(State));
  Result = mbrlen(s, n, &State);

  // mblen() reports every failure as -1.  (size_t)(-2), "incomplete", is a
  // value only the restartable functions may return.
  if((Result == (size_t)-1) || (Result == (size_t)-2)) {
    return -1;
  }
  return (int)Result;
}
378
379 /**
380 If S is a null pointer, the mbrtowc function is equivalent to the call:<BR>
381 @verbatim
382 mbrtowc(NULL, "", 1, ps)
383 @endverbatim
384
385 In this case, the values of the parameters pwc and n are ignored.
386
387 If S is not a null pointer, the mbrtowc function inspects at most n bytes beginning with
388 the byte pointed to by S to determine the number of bytes needed to complete the next
389 multibyte character (including any shift sequences). If the function determines that the
390 next multibyte character is complete and valid, it determines the value of the
391 corresponding wide character and then, if pwc is not a null pointer, stores that value in
392 the object pointed to by pwc. If the corresponding wide character is the null wide
393 character, the resulting state described is the initial conversion state.
394
395 @retval 0 if the next n or fewer bytes complete the multibyte
396 character that corresponds to the null wide
397 character (which is the value stored).
398 @retval between_1_and_n_inclusive if the next n or fewer bytes complete
399 a valid multibyte character (which is the value
400 stored); the value returned is the number of bytes
401 that complete the multibyte character.
402 @retval (size_t)(-2) if the next n bytes contribute to an incomplete
403 (but potentially valid) multibyte character, and
404 all n bytes have been processed (no value is stored).
405 @retval (size_t)(-1) if an encoding error occurs, in which case the next
406 n or fewer bytes do not contribute to a complete and
407 valid multibyte character (no value is stored); the
408 value of the macro EILSEQ is stored in errno, and
409 the conversion state is unspecified.
410
411 Declared in: wchar.h
412 **/
size_t
mbrtowc(
  wchar_t     *pwc,
  const char  *s,
  size_t       n,
  mbstate_t   *ps
  )
{
  ssize_t   Len;
  int       RetVal;

  // One character never needs more than MB_LEN_MAX bytes, so clamping loses
  // nothing.  It keeps a very large n from turning into a negative length
  // when it is narrowed to ssize_t, which would stop anything being converted.
  if(n > (size_t)MB_LEN_MAX) {
    Len = (ssize_t)MB_LEN_MAX;
  }
  else {
    Len = (ssize_t)n;
  }

  RetVal = DecodeOneStateful(pwc, s, Len, ps);
  return (size_t)RetVal;    // -1 and -2 become (size_t)(-1) and (size_t)(-2)
}
426
427 /** Convert a multibyte character into a wide character.
428
429 If S is not a null pointer, the mbtowc function inspects at most N bytes
430 beginning with the byte pointed to by S to determine the number of bytes
431 needed to complete the next multibyte character (including any shift
432 sequences). If the function determines that the next multibyte character
433 is complete and valid, it determines the value of the corresponding wide
434 character and then, if Pwc is not a null pointer, stores that value in
435 the object pointed to by Pwc. If the corresponding wide character is the
436 null wide character, the function is left in the initial conversion state.
437
438 @param[out] Pwc Pointer to a wide-character object to receive the converted character.
439 @param[in] S Pointer to a multibyte character to convert.
440 @param[in] N Maximum number of bytes in a multibyte character.
441
442 @return If S is a null pointer, the mbtowc function returns a nonzero or
443 zero value, if multibyte character encodings, respectively, do
444 or do not have state-dependent encodings. If S is not a null
445 pointer, the mbtowc function either returns 0 (if S points to
446 the null character), or returns the number of bytes that are
447 contained in the converted multibyte character (if the next N or
448 fewer bytes form a valid multibyte character), or returns -1
449 (if they do not form a valid multibyte character).
450
451 In no case will the value returned be greater than N or the value
452 of the MB_CUR_MAX macro.
453
454 Declared in: stdlib.h
455 **/
int
mbtowc(
  wchar_t     *pwc,
  const char  *s,
  size_t       n
  )
{
  mbstate_t   State;
  size_t      Result;

  if(s == NULL) {
    return 0;     // UTF-8 has no state-dependent encodings
  }

  // Always decode from the initial state.  A sequence left incomplete by one
  // call must not be continued by the next, unrelated, call.
  memset(&State, 0, sizeof(State));
  Result = mbrtowc(pwc, s, n, &State);

  // mbtowc() reports every failure as -1.  (size_t)(-2), "incomplete", is a
  // value only the restartable functions may return.
  if((Result == (size_t)-1) || (Result == (size_t)-2)) {
    return -1;
  }
  return (int)Result;
}
465
/**
    The mbsrtowcs function converts a sequence of multibyte characters that begins in the
    conversion state described by the object pointed to by ps, from the array indirectly
    pointed to by src into a sequence of corresponding wide characters.  If dst is not a null
    pointer, the converted characters are stored into the array pointed to by dst.  Conversion
    continues up to and including a terminating null character, which is also stored.
    Conversion stops earlier in two cases: when a sequence of bytes is encountered that does
    not form a valid multibyte character, or (if dst is not a null pointer) when len wide
    characters have been stored into the array pointed to by dst.  If dst is a null pointer,
    len is ignored.

    A four byte multibyte character is converted into TWO wide characters, a surrogate
    pair.  The pair is only stored if both halves fit within len.

    If dst is not a null pointer, the pointer object pointed to by src is assigned either a null
    pointer (if conversion stopped due to reaching a terminating null character) or the address
    just past the last multibyte character converted (if any).

    @return   If the input conversion encounters a sequence of bytes that do
              not form a valid multibyte character, an encoding error occurs:
              the mbsrtowcs function stores the value of the macro EILSEQ in
              errno and returns (size_t)(-1).  Otherwise, it returns the number
              of wide characters (array elements) produced, not including the
              terminating null wide character (if any).

    Declared in: wchar.h
**/
size_t
mbsrtowcs(
  wchar_t      *dst,
  const char  **src,
  size_t        len,
  mbstate_t    *ps
  )
{
  const char   *MySrc;
  wchar_t       Wide[2];      // One decoded character; two elements for a surrogate pair
  size_t        Stored;       // Wide elements produced so far
  size_t        Units;        // Wide elements the current character needs
  int           x;

  if((src == NULL) || (*src == NULL)) {
    return 0;
  }

  MySrc   = *src;
  Stored  = 0;

  // len limits the output only when there is an output buffer.
  while((dst == NULL) || (Stored < len)) {
    // Decode into a local buffer so nothing is written to dst before it is
    // known that the whole character fits.
    x = DecodeOneStateful(Wide, MySrc, MB_LEN_MAX, ps);
    if(x < 0) {
      // -1: Encoding error.  -2: Incomplete character.
      return (size_t)x;
    }
    if(x == 0) {
      // Encountered NUL character: done.
      if(dst != NULL) {
        dst[Stored] = 0;
        *src = NULL;
      }
      break;
    }

    // Successfully decoded a character.
    Units = (x == 4) ? 2 : 1;
    if(dst != NULL) {
      if(Units > (len - Stored)) {
        break;      // A surrogate pair would run past the end of dst
      }
      dst[Stored] = Wide[0];
      if(Units == 2) {
        dst[Stored + 1] = Wide[1];
      }
      *src = MySrc + x;
    }
    MySrc  += x;
    Stored += Units;
  }
  return Stored;
}
538
539 /** Convert a multibyte character string into a wide-character string.
540
541 The mbstowcs function converts a sequence of multibyte characters that
542 begins in the initial shift state from the array pointed to by Src into
543 a sequence of corresponding wide characters and stores not more than limit
544 wide characters into the array pointed to by Dest. No multibyte
545 characters that follow a null character (which is converted into a null
546 wide character) will be examined or converted. Each multibyte character
547 is converted as if by a call to the mbtowc function, except that the
548 conversion state of the mbtowc function is not affected.
549
550 No more than Limit elements will be modified in the array pointed to by Dest.
551 If copying takes place between objects that overlap,
552 the behavior is undefined.
553
554 @param[out] Dest Pointer to the array to receive the converted string.
555 @param[in] Src Pointer to the string to be converted.
556 @param[in] Limit Maximum number of elements to be written to Dest.
557
558 @return If an invalid multibyte character is encountered, the mbstowcs
559 function returns (size_t)(-1). Otherwise, the mbstowcs function
560 returns the number of array elements modified, not including a
561 terminating null wide character, if any.
562
563 Declared in: stdlib.h
564 **/
size_t
mbstowcs(
  wchar_t     *pwcs,
  const char  *s,
  size_t       n
  )
{
  // mbsrtowcs() wants the address of the source pointer because it may update
  // it.  Hand it a private copy; the caller of mbstowcs() never sees the update.
  const char *Cursor = s;

  /* Both pwcs and s are allowed to be NULL. */
  return mbsrtowcs(pwcs, &Cursor, n, NULL);
}
578
579 /** The btowc function determines whether C constitutes a valid single-byte
580 character in the initial shift state.
581
582 @return The btowc function returns WEOF if c has the value EOF or if
583 (unsigned char)C does not constitute a valid single-byte
584 character in the initial shift state. Otherwise, it returns the
585 wide character representation of that character.
586
587 Declared in: wchar.h
588 **/
wint_t
btowc(int c)
{
  mbstate_t       State;
  unsigned char   Byte;
  wchar_t         Dest;
  int             x;

  if (c == EOF) {
    return WEOF;
  }

  // Decode an actual byte object.  Reading the first byte of the int through
  // a char pointer only yields the right byte on little-endian machines.
  Byte = (unsigned char)c;

  // btowc is defined in terms of the initial shift state, so use a private,
  // freshly cleared state rather than whatever another function left behind.
  memset(&State, 0, sizeof(State));

  x = DecodeOneStateful(&Dest, (const char *)&Byte, 1, &State);
  if(x == 0) {
    return 0;               // The NUL character
  }
  if(x == 1) {
    return (wint_t)Dest;    // A valid single-byte character
  }
  return WEOF;              // Part of a multibyte sequence, or an invalid byte
}
607
608 // ######################## Wide to Narrow Conversions #######################
609
610 /**
611 If S is a null pointer, the wcrtomb function is equivalent to the call:<BR>
612 @verbatim
613 wcrtomb(buf, L'\0', ps)
614 @endverbatim
615 where buf is an internal buffer.
616
617 If S is not a null pointer, the wcrtomb function determines the number of bytes needed
618 to represent the multibyte character that corresponds to the wide character given by wc
619 (including any shift sequences), and stores the multibyte character representation in the
620 array whose first element is pointed to by S. At most MB_CUR_MAX bytes are stored. If
621 wc is a null wide character, a null byte is stored, preceded by any shift sequence needed
622 to restore the initial shift state; the resulting state described is the initial conversion state.
623
624 @return The wcrtomb function returns the number of bytes stored in the
625 array object (including any shift sequences). When wc is not a
626 valid wide character, an encoding error occurs: the function
627 stores the value of the macro EILSEQ in errno and
628 returns (size_t)(-1); the conversion state is unspecified.
629
630 Declared in: wchar.h
631 **/
size_t
wcrtomb(
  char       *s,
  wchar_t     wchar,
  mbstate_t  *ps
  )
{
  char      Buff[MB_LEN_MAX + 1];   // One character plus the NUL EncodeUtf8 appends
  ssize_t   Count;

  /* ps is not used: UTF-8 needs no conversion state when encoding. */

  if (s == NULL) {
    // Equivalent to converting L'\0' into an internal buffer: one byte.
    return 1;
  }
  if (wchar == L'\0') {
    *s = '\0';
    return 1;
  }

  // Build the character in a local buffer.  EncodeUtf8 terminates its output
  // with a NUL; encoding straight into s would therefore store one byte more
  // than the count this function returns.
  Count = EncodeUtf8(Buff, &wchar, 1);
  if(Count < 0) {
    return (size_t)-1;
  }
  memcpy(s, Buff, (size_t)Count);
  return (size_t)Count;
}
656
657 /** Convert a wide character into a multibyte character.
658
659 The wctomb function determines the number of bytes needed to represent the
660 multibyte character corresponding to the wide character given by WC
661 (including any shift sequences), and stores the multibyte character
662 representation in the array whose first element is pointed to by S (if S is
663 not a null pointer). At most MB_CUR_MAX characters are stored. If WC is a
664 null wide character, a null byte is stored, preceded by any shift sequence
665 needed to restore the initial shift state, and the function is left in the
666 initial conversion state.
667
668 @param[out] S Pointer to the object to receive the converted multibyte character.
669 @param[in] WC Wide character to be converted.
670
671 @return If S is a null pointer, the wctomb function returns a nonzero or
672 zero value, if multibyte character encodings, respectively, do or
673 do not have state-dependent encodings. If S is not a null pointer,
674 the wctomb function returns -1 if the value of WC does not
675 correspond to a valid multibyte character, or returns the number
676 of bytes that are contained in the multibyte character
677 corresponding to the value of WC.
678
679 In no case will the value returned be greater than the value of
680 the MB_CUR_MAX macro.
681
682 Declared in: stdlib.h
683 **/
int
wctomb(
  char     *s,
  wchar_t   wchar
  )
{
  // With no destination this is only a query: do multibyte characters have
  // state-dependent encodings?  They do not, hence zero.
  int   Stored = 0;

  if (s != NULL) {
    Stored = (int)wcrtomb(s, wchar, NULL);
  }
  return Stored;
}
699
700 /** The wcsrtombs function converts a sequence of wide characters from the array
701 indirectly pointed to by S into a sequence of corresponding multibyte
702 characters that begins in the conversion state described by the object
703 pointed to by ps.
704
705 If S is not a null pointer, the converted characters
706 are then stored into the array pointed to by S. Conversion continues
707 up to and including a terminating null wide character, which is also
708 stored. Conversion stops earlier in two cases: when a wide character is
709 reached that does not correspond to a valid multibyte character, or
710 (if S is not a null pointer) when the next multibyte character would
711 exceed the limit of N total bytes to be stored into the array pointed
712 to by S. Each conversion takes place as if by a call to the wcrtomb
713 function.)
714
715 If S is not a null pointer, the pointer object pointed to by pwcs is
716 assigned either a null pointer (if conversion stopped due to reaching
717 a terminating null wide character) or the address just past the last wide
718 character converted (if any). If conversion stopped due to reaching a
719 terminating null wide character, the resulting state described is the
720 initial conversion state.
721
722 @return If conversion stops because a wide character is reached that
723 does not correspond to a valid multibyte character, an
724 encoding error occurs: the wcsrtombs function stores the
725 value of the macro EILSEQ in errno and returns (size_t)(-1);
726 the conversion state is unspecified. Otherwise, it returns
727 the number of bytes in the resulting multibyte character
728 sequence, not including the terminating null character (if any).
729
730 Declared in: wchar.h
731 **/
size_t
wcsrtombs(
  char            *s,
  const wchar_t  **pwcs,
  size_t           n,
  mbstate_t       *ps
  )
{
  char            Buff[MB_LEN_MAX + 1]; // One character, plus one spare byte in case
                                        // wcrtomb appends a NUL after it
  const wchar_t  *Next;                 // Next wide character to convert
  size_t          Total;                // Bytes produced so far
  size_t          Count;                // Bytes produced by the current character

  /* s may be NULL: then nothing is stored, n is ignored, and *pwcs is left alone. */

  if ((pwcs == NULL) || (*pwcs == NULL)) {
    return 0;
  }

  Next  = *pwcs;
  Total = 0;

  for (;;) {
    wchar_t   wc = *Next;

    if (wc == 0) {
      // Terminating null wide character.  It is stored, but not counted.
      if (s != NULL) {
        if (Total >= n) {
          break;                // No room left for the terminator
        }
        s[Total] = '\0';
        *pwcs = NULL;
      }
      return Total;
    }

    // Each wide character is converted as if by a call to wcrtomb.  Surrogate
    // halves are therefore converted one at a time, exactly as wcrtomb does.
    Count = wcrtomb(Buff, wc, ps);
    if (Count == (size_t)-1) {
      if (s != NULL) {
        *pwcs = Next;
      }
      return (size_t)-1;
    }

    if (s != NULL) {
      if (Count > (n - Total)) {
        break;                  // This character would exceed the limit
      }
      memcpy(&s[Total], Buff, Count);
    }
    Total += Count;
    ++Next;
  }

  // Only reached with s != NULL: conversion stopped because the limit was hit.
  *pwcs = Next;
  return Total;
}
767
768 /** Convert a wide-character string into a multibyte character string.
769
770 The wcstombs function converts a sequence of wide characters from the
771 array pointed to by Src into a sequence of corresponding multibyte
772 characters that begins in the initial shift state, and stores these
773 multibyte characters into the array pointed to by Dest, stopping if a
774 multibyte character would exceed the limit of Limit total bytes or if a
775 null character is stored. Each wide character is converted as if by
776 a call to the wctomb function, except that the conversion state of
777 the wctomb function is not affected.
778
779 No more than Limit bytes will be modified in the array pointed to by Dest.
780 If copying takes place between objects that overlap,
781 the behavior is undefined.
782
783 @param[out] Dest Pointer to the array to receive the converted string.
784 @param[in] Src Pointer to the string to be converted.
785 @param[in] Limit Maximum number of elements to be written to Dest.
786
787 @return If a wide character is encountered that does not correspond to a
788 valid multibyte character, the wcstombs function returns
789 (size_t)(-1). Otherwise, the wcstombs function returns the number
790 of bytes modified, not including a terminating null character,
791 if any.
792
793 Declared in: stdlib.h
794 **/
size_t
wcstombs(
  char           *s,
  const wchar_t  *pwcs,
  size_t          n
  )
{
  // wcsrtombs() wants the address of the source pointer because it may update
  // it.  Hand it a private copy; the caller of wcstombs() never sees the update.
  const wchar_t *Cursor = pwcs;

  /* s is allowed to be NULL. */
  return wcsrtombs(s, &Cursor, n, NULL);
}
805
806 /** The wctob function determines whether C corresponds to a member of the extended
807 character set whose multibyte character representation is a single byte when in the initial
808 shift state.
809
810 @return The wctob function returns EOF if C does not correspond to a multibyte
811 character with length one in the initial shift state. Otherwise, it
812 returns the single-byte representation of that character as an
813 unsigned char converted to an int.
814
815 Declared in: wchar.h
816 **/
int
wctob(wint_t c)
{
  /*  wctob needs to be consistent with wcrtomb and btowc.
      In UTF-8 only the values 0x00 through 0x7F are represented by a single
      byte; every other wide character needs two or more bytes and therefore
      has no single-byte representation.
  */
  if ((c == WEOF) || ((c & ~(wint_t)0x7F) != 0)) {
    return EOF;
  }
  return (int)c;
}