]> git.proxmox.com Git - mirror_edk2.git/blame - StdLib/LibC/Locale/multibyte_Utf8.c
UnixPkg: Mark as deprecated (replaced by EmulatorPkg)
[mirror_edk2.git] / StdLib / LibC / Locale / multibyte_Utf8.c
CommitLineData
a7a8363d 1/** @file\r
2 Copyright (c) 2012, Intel Corporation. All rights reserved.<BR>\r
3 This program and the accompanying materials\r
4 are licensed and made available under the terms and conditions of the BSD License\r
5 which accompanies this distribution. The full text of the license may be found at\r
6 http://opensource.org/licenses/bsd-license.php\r
7\r
8 THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,\r
9 WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.\r
10**/\r
11#include <assert.h>\r
12#include <string.h>\r
13#include <errno.h>\r
14#include <stdlib.h>\r
15#include <wchar.h>\r
16#include <sys/types.h>\r
17\r
18typedef int ch_UCS4;\r
19\r
20static mbstate_t LocalConvState = {0};\r
21\r
22/** Map a UTF-8 encoded prefix byte to a sequence length.\r
23 Zero means illegal prefix, but valid surrogate if < 0xC0.\r
24 One indicates an ASCII-7 equivalent character.\r
25 Two, three, and four are the first byte for 2, 3, and 4 byte sequences, respectively.\r
26 See RFC 3629 for details.\r
27\r
28 TABLE ENCODING:\r
29 Low Nibble decodes the first byte into the number of bytes in the sequence.\r
30 A value of zero indicates an invalid byte.\r
31 The High Nibble encodes a bit mask to be used to match against the high nibble of the second byte.\r
32\r
33 example:\r
34 SequenceLength = code[c0] & 0x0F;\r
35 Mask = 0x80 | code[c0];\r
36\r
37 Surrogate bytes are valid if: code[cX] & Mask > 0x80;\r
38\r
39*/\r
40static\r
41UINT8 utf8_code_length[256] = {\r
42 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 00-0F */\r
43 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,\r
44 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,\r
45 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,\r
46 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,\r
47 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,\r
48 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,\r
49 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 70-7F */\r
50 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, /* 80-8F */\r
51 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, /* 90-9F */\r
52 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, /* A0-AF */\r
53 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, /* B0-BF */\r
54 0x00, 0x00, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, /* C0-C1 + C2-CF */\r
55 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, /* D0-DF */\r
56 0x43, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x33, 0x73, 0x73, /* E0-EF */\r
57 0x64, 0x74, 0x74, 0x74, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* F0-F4 + F5-FF */\r
58};\r
59\r
60/** Process one byte of a multibyte character.\r
61\r
62 @param ch\r
63 @param ps\r
64\r
65 @retval -2\r
66 @retval -1\r
67 @retval 1:4\r
68**/\r
69static\r
70int\r
71ProcessOneByte(unsigned char ch, mbstate_t *ps)\r
72{\r
73 UINT32 Mask;\r
74 UINT32 Length;\r
75 int RetVal = 0;\r
76\r
77 if(ps->A > 3) {\r
78 // We are in an invalid state\r
79 ps->A = 0; // Initial State\r
80 }\r
81 ps->C[ps->A] = ch; // Save the current character\r
82 Mask = utf8_code_length[ch];\r
83\r
84 if(ps->A == 0) { // Initial State. First byte of sequence.\r
85 ps->E = Mask | 0x80;\r
86 Length = Mask & 0xF;\r
87 switch(Length) {\r
88 case 0: // State 0, Code 0\r
89 errno = EILSEQ;\r
90 RetVal = -1;\r
91 ps->E = 1; // Consume this character\r
92 break;\r
93 case 1: // State 0, Code 1\r
94 // ASCII-7 Character\r
95 ps->B = ps->D[0] = ch;\r
96 RetVal = 1;\r
97 break;\r
98 default: // State 0, Code 2, 3, 4\r
99 ps->A = 1; // Next state is State-1\r
100 RetVal = -2; // Incomplete but potentially valid character\r
101 break;\r
102 }\r
103 }\r
104 else {\r
105 // We are in state 1, 2, or 3 and processing a surrogate byte\r
106 Length = ps->E & 0xF;\r
107 if((Mask & ps->E) > 0x80) {\r
108 // This byte is valid\r
109 switch(ps->A) { // Process based upon our current state\r
110 case 1: // Second byte of the sequence.\r
111 if(Length == 2) { // State 1, Code 2\r
112 Length = ((ps->C[0] & 0x1f) << 6) + (ps->C[1] & 0x3f);\r
113 assert ((Length > 0x007F) && (Length <= 0x07FF));\r
114 ps->B = ps->D[0] = (UINT16)Length;\r
115 ps->A = 0; // Next state is State-0\r
116 RetVal = 2;\r
117 }\r
118 else { // This isn't the last character, get more. State 1, Code 3 or 4\r
119 ps->A = 2;\r
120 RetVal = -2;\r
121 }\r
122 break;\r
123 case 2: // Third byte of the sequence\r
124 if(Length == 3) {\r
125 Length = ((ps->C[0] & 0x0f) << 12) + ((ps->C[1] & 0x3f) << 6) + (ps->C[2] & 0x3f);\r
126 assert ((Length > 0x07FF) && (Length <= 0xFFFF));\r
127 ps->B = ps->D[0] = (UINT16)Length;\r
128 ps->A = 0; // Next state is State-0\r
129 RetVal = 3;\r
130 }\r
131 else {\r
132 ps->A = 3;\r
133 RetVal = -2;\r
134 }\r
135 break;\r
136 case 3: // Fourth byte of the sequence\r
137 if(Length == 4) {\r
138 Length = ((ps->C[0] & 0x7) << 18) + ((ps->C[1] & 0x3f) << 12) +\r
139 ((ps->C[2] & 0x3f) << 6) + (ps->C[3] & 0x3f);\r
140 ps->B = Length;\r
141 assert ((Length > 0xFFFF) && (Length <= 0x10ffff));\r
142\r
143 /* compute and append the two surrogates: */\r
144\r
145 /* translate from 10000..10FFFF to 0..FFFF */\r
146 Length -= 0x10000;\r
147\r
148 /* high surrogate = top 10 bits added to D800 */\r
149 ps->D[0] = (UINT16)(0xD800 + (Length >> 10));\r
150\r
151 /* low surrogate = bottom 10 bits added to DC00 */\r
152 ps->D[1] = (UINT16)(0xDC00 + (Length & 0x03FF));\r
153 ps->A = 0; // Next state is State-0\r
154 RetVal = 4;\r
155 }\r
156 else {\r
157 errno = EILSEQ;\r
158 ps->A = 0;\r
159 RetVal = -1;\r
160 ps->E = 4; // Can't happen, but consume this character anyway\r
161 }\r
162 break;\r
163 }\r
164 }\r
165 else { // Invalid surrogate character\r
166 errno = EILSEQ;\r
167 ps->A = 0; // Next is State-0\r
168 RetVal = -1;\r
169 ps->E = 0; // Don't Consume, it may be an initial byte\r
170 }\r
171 }\r
172 return RetVal;\r
173}\r
174\r
175/** Convert one Multibyte sequence.\r
176\r
177 @param Dest\r
178 @param Src\r
179 @param Len\r
180 @param pS\r
181\r
182 @retval -2 Bytes processed comprise an incomplete, but potentially valid, character.\r
183 @retval -1 An encoding error was encountered. ps->E indicates the number of bytes consumed.\r
184 @retval 0 Either Src is NULL or it points to a NUL character.\r
185 @retval 1:N N bytes were consumed producing a valid wide character.\r
186**/\r
187int\r
188DecodeOneStateful(\r
189 wchar_t *Dest, // Pointer to output location, or NULL\r
190 const char *Src, // Multibyte Source (UTF8)\r
191 ssize_t Len, // Max Number of bytes to convert\r
192 mbstate_t *pS // Pointer to State struct., or NULL\r
193 )\r
194{\r
195 const char *SrcEnd;\r
196 int NumConv;\r
197 unsigned char ch;\r
198\r
199 if((Src == NULL) || (*Src == '\0')) {\r
200 return 0;\r
201 }\r
202 if(pS == NULL) {\r
203 pS = &LocalConvState;\r
204 }\r
205 SrcEnd = Src + Len;\r
206 NumConv = 0;\r
207 while(Src < SrcEnd) {\r
208 ch = (unsigned char)*Src++;\r
209 NumConv = ProcessOneByte(ch, pS);\r
210 if(NumConv != -2)\r
211 break;\r
212 }\r
213 if((NumConv > 0) && (Dest != NULL)) {\r
214 Dest[0] = pS->D[0];\r
215 if(NumConv == 4) {\r
216 Dest[1] = pS->D[1];\r
217 }\r
218 }\r
219 return NumConv;\r
220}\r
221\r
/** Convert wide characters (UTF-16) into multibyte characters (UTF-8).

    A high surrogate immediately followed by a low surrogate is combined and
    written as one four byte sequence.  An isolated surrogate is written as a
    three byte sequence.

    If Dest is not NULL the converted bytes are stored there followed by a
    terminating NUL, so Dest must have room for the returned count plus one.
    If Dest is NULL nothing is stored and only the count is computed.

    @param[out] Dest    Destination buffer, or NULL to only count bytes.
    @param[in]  s       Pointer to the wide characters to convert.  Must not be NULL.
    @param[in]  size    Number of wide characters in s to convert.  Must not be negative.

    @return   The number of bytes produced, not counting the terminating NUL.
              If size is so large that the result could not be represented,
              (ssize_t)-1 is returned and errno is set to EINVAL.
**/
ssize_t
EncodeUtf8(char *Dest, wchar_t *s, ssize_t size)
{
  char       *p;              /* next free byte in build buffer */
  char       *v;              /* next free byte in destination */
  ssize_t     nneeded;        /* number of result bytes produced */
  ssize_t     i;              /* index into s of next input character */
  int         NumInBuff;      // number of bytes in Buff
  char        Buff[4];        // Buffer into which each character is built

  assert(s != NULL);
  assert(size >= 0);

  v = Dest;
  nneeded = 0;
  // Reject sizes for which size * MB_LEN_MAX would not fit in a ssize_t.
  // The test is done by division so that it can not itself overflow.
  if((size > 0) && ((size_t)size > (((size_t)-1) / 2) / MB_LEN_MAX)) {
    errno = EINVAL;
    return (ssize_t)-1;
  }

  for (i = 0; i < size;) {
    int ch = s[i++];          /* code point being encoded */
    p = Buff;

    if (ch < 0x80) {
      /* Encode ASCII -- One Byte */
      *p++ = (char) ch;
    }
    else if (ch < 0x0800) {
      /* Encode Latin-1 -- Two Byte */
      *p++ = (char)(0xc0 | (ch >> 6));
      *p++ = (char)(0x80 | (ch & 0x3f));
    }
    else {
      /* A high surrogate followed by a low surrogate forms one UCS4 value. */
      if (0xD800 <= ch && ch <= 0xDBFF && i < size) {
        int ch2 = s[i];
        if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
          ch = (((ch - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000;
          i++;
        }
        /* Otherwise: isolated high surrogate, encoded as it stands. */
      }
      if (ch >= 0x10000) {
        /* Encode UCS4 Unicode ordinals -- Four Byte */
        *p++ = (char)(0xf0 | (ch >> 18));
        *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
        *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
        *p++ = (char)(0x80 | (ch & 0x3f));
      }
      else {
        /* Encode UCS2 Unicode ordinals -- Three Byte */
        *p++ = (char)(0xe0 | (ch >> 12));
        *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
        *p++ = (char)(0x80 | (ch & 0x3f));
      }
    }
    /* Every path reaches this point, so every character that was built,
       including four byte ones, is copied out and counted.
       NumInBuff is the value 1, 2, 3, or 4.
    */
    NumInBuff = (int)(p - Buff);
    if(Dest != NULL) {
      memcpy(v, Buff, NumInBuff);
      v += NumInBuff;
    }
    nneeded += NumInBuff;
  }
  if(Dest != NULL) {
    // Terminate the destination string.
    *v = '\0';
  }
  return nneeded;
}
304\r
305// ######################## Narrow to Wide Conversions #######################\r
306\r
307/** If ps is not a null pointer, the mbsinit function determines whether the\r
308 pointed-to mbstate_t object describes an initial conversion state.\r
309\r
310 @return The mbsinit function returns nonzero if ps is a null pointer\r
311 or if the pointed-to object describes an initial conversion\r
312 state; otherwise, it returns zero.\r
313\r
314 Declared in: wchar.h\r
315**/\r
316int\r
317mbsinit(const mbstate_t *ps)\r
318{\r
319 if((ps == NULL) || (ps->A == 0)) {\r
320 return 1;\r
321 }\r
322 return 0;\r
323}\r
324\r
/** Determine the length in bytes of the next multibyte character.

    The mbrlen function is equivalent to the call:<BR>
@verbatim
    mbrtowc(NULL, s, n, ps != NULL ? ps : &internal)
@endverbatim
    where internal is an mbstate_t object private to mbrlen, so that calls
    made with a NULL ps do not disturb the state used by mbrtowc.

    @param[in]      s     The multibyte character to measure.
    @param[in]      n     Maximum number of bytes of s to examine.
    @param[in,out]  ps    Conversion state, or NULL to use mbrlen's own.

    @return   The mbrlen function returns a value between zero and n,
              inclusive, (size_t)(-2), or (size_t)(-1).

    Declared in: wchar.h
**/
size_t
mbrlen(
  const char *s,
  size_t n,
  mbstate_t *ps
  )
{
  static mbstate_t  Internal = {0};

  return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &Internal);
}
346\r
/** Determine the number of bytes comprising a multibyte character.

    If S is not a null pointer, the mblen function determines the number of bytes
    contained in the multibyte character pointed to by S.  Each call starts
    from the initial conversion state and uses a state object of its own, so
    the conversion state of the mbtowc function is not affected.

    @param[in]  S   NULL to query whether multibyte characters have
                    state-dependent encodings.  Otherwise, points to a
                    multibyte character.
    @param[in]  N   The maximum number of bytes in a multibyte character.

    @return   If S is a null pointer, the mblen function returns zero because
              this encoding is not state-dependent.  If S is not a null
              pointer, the mblen function either returns 0 (if S points to the
              null character), or returns the number of bytes that are contained
              in the multibyte character (if the next N or fewer bytes form a
              valid multibyte character), or returns -1 (if they do not form a
              valid multibyte character, which includes the case of a
              character that is cut short after N bytes).

    Declared in: stdlib.h
**/
int
mblen(
  const char *s,
  size_t n
  )
{
  mbstate_t   State;
  int         Result;

  if (s == NULL) {
    return 0;                 // No state-dependent encodings
  }
  memset(&State, 0, sizeof(State));   // Begin in the initial conversion state
  Result = (int)mbrlen(s, n, &State);

  // mbrlen distinguishes "incomplete" (-2) from "invalid" (-1); mblen does not.
  return (Result < 0) ? -1 : Result;
}
378\r
/** Convert the next multibyte character to a wide character, restartably.

    If S is a null pointer, the values of the parameters pwc and n are
    ignored and zero is returned.

    If S is not a null pointer, the mbrtowc function inspects at most n bytes beginning with
    the byte pointed to by S to determine the number of bytes needed to complete the next
    multibyte character.  If the function determines that the next multibyte character is
    complete and valid, it determines the value of the corresponding wide character and then,
    if pwc is not a null pointer, stores that value in the object pointed to by pwc.

    A four byte UTF-8 sequence is stored as a UTF-16 surrogate pair, so in
    that case TWO elements are written starting at pwc.

    @param[out]     pwc   Where to store the wide character, or NULL.
    @param[in]      s     The multibyte character to convert, or NULL.
    @param[in]      n     Maximum number of bytes of s to examine.
    @param[in,out]  ps    Conversion state, or NULL to use an internal one.

    @retval 0                         if the next n or fewer bytes complete the multibyte
                                      character that corresponds to the null wide
                                      character (which is the value stored).
    @retval between_1_and_n_inclusive if the next n or fewer bytes complete
                                      a valid multibyte character (which is the value
                                      stored).
    @retval (size_t)(-2)              if the next n bytes contribute to an incomplete
                                      (but potentially valid) multibyte character, and
                                      all n bytes have been processed (no value is stored).
    @retval (size_t)(-1)              if an encoding error occurs, in which case the next
                                      n or fewer bytes do not contribute to a complete and
                                      valid multibyte character (no value is stored); the
                                      value of the macro EILSEQ is stored in errno, and
                                      the conversion state is unspecified.

    Declared in: wchar.h
**/
size_t
mbrtowc(
  wchar_t *pwc,
  const char *s,
  size_t n,
  mbstate_t *ps
  )
{
  // A UTF-8 sequence never exceeds four bytes, so there is no reason to
  // hand a larger count to the decoder.
  enum { MaxUtf8Bytes = 4 };

  ssize_t   Len;
  int       RetVal;

  if (s == NULL) {
    return 0;
  }
  if (n == 0) {
    // Nothing may be examined: the character is (trivially) incomplete.
    return (size_t)(-2);
  }

  // Clamp before converting to a signed length; a very large n would
  // otherwise become negative and no bytes at all would be examined.
  Len = (n > (size_t)MaxUtf8Bytes) ? (ssize_t)MaxUtf8Bytes : (ssize_t)n;

  RetVal = DecodeOneStateful(pwc, s, Len, ps);
  if ((RetVal == 0) && (pwc != NULL)) {
    *pwc = L'\0';             // The null wide character is a stored value too
  }
  return (size_t)RetVal;
}
426\r
/** Convert a multibyte character into a wide character.

    If S is not a null pointer, the mbtowc function inspects at most N bytes
    beginning with the byte pointed to by S to determine the number of bytes
    needed to complete the next multibyte character.  If the function
    determines that the next multibyte character is complete and valid, it
    determines the value of the corresponding wide character and then, if
    Pwc is not a null pointer, stores that value in the object pointed to by
    Pwc.

    The conversion itself is performed by mbrtowc using its internal state.

    @param[out]   Pwc   Pointer to a wide-character object to receive the converted character.
    @param[in]    S     Pointer to a multibyte character to convert.
    @param[in]    N     Maximum number of bytes in a multibyte character.

    @return   If S is a null pointer, the mbtowc function returns zero because
              this encoding is not state-dependent.  If S is not a null
              pointer, the mbtowc function either returns 0 (if S points to
              the null character), or returns the number of bytes that are
              contained in the converted multibyte character (if the next N or
              fewer bytes form a valid multibyte character), or returns -1
              (if they do not form a valid multibyte character, which includes
              a character that is cut short after N bytes).

    Declared in: stdlib.h
**/
int
mbtowc(
  wchar_t *pwc,
  const char *s,
  size_t n
  )
{
  int   Result;

  if (s == NULL) {
    return 0;                 // No state-dependent encodings
  }
  Result = (int)mbrtowc(pwc, s, n, NULL);

  // mbrtowc reports "incomplete" as -2; this interface only knows -1.
  return (Result < 0) ? -1 : Result;
}
465\r
/** Convert a multibyte (UTF-8) string to a wide-character (UTF-16) string, restartably.

    Conversion begins in the state described by ps and continues up to and
    including the terminating null character, which is also stored.
    Conversion stops earlier in two cases: when a sequence of bytes is
    encountered that does not form a valid multibyte character, or (if dst is
    not a null pointer) when the next character would not fit within len
    elements of dst.

    A four byte UTF-8 sequence becomes a surrogate pair and therefore
    occupies, and is counted as, two elements.  A pair is never split: if only
    one element of room remains, conversion stops before it.

    If dst is a null pointer nothing is stored, len is ignored, *src is left
    unchanged, and the return value is the number of elements that a
    conversion of the whole string would produce.

    If dst is not a null pointer, the pointer object pointed to by src is
    assigned either a null pointer (if conversion stopped due to reaching a
    terminating null character) or the address just past the last multibyte
    character converted.

    @param[out]     dst   Destination array, or NULL to only count.
    @param[in,out]  src   Address of the pointer to the source string.
    @param[in]      len   Number of elements available at dst.
    @param[in,out]  ps    Conversion state, or NULL to use an internal one.

    @return   If the input contains a sequence of bytes that does not form a
              valid multibyte character, the value of the macro EILSEQ is
              stored in errno and (size_t)(-1) is returned.  Otherwise, the
              number of wide-character elements produced, not including the
              terminating null wide character.

    Declared in: wchar.h
**/
size_t
mbsrtowcs(
  wchar_t *dst,
  const char **src,
  size_t len,
  mbstate_t *ps
  )
{
  const char   *MySrc;
  wchar_t       Pair[2];      // The decoder may store a surrogate pair
  size_t        Count;        // Elements produced so far
  size_t        Units;        // Elements produced by the current character
  int           x;

  if((src == NULL) || (*src == NULL)) {
    return 0;
  }

  MySrc = *src;
  Count = 0;
  while((dst == NULL) || (Count < len)) {
    // Decode into a local buffer so that a surrogate pair can never be
    // written past the end of dst.
    x = DecodeOneStateful(Pair, MySrc, MB_LEN_MAX, ps);
    if(x < 0) {
      // Encoding error, or a character cut short.  Either way the string
      // contains bytes that do not form a valid character.
      errno = EILSEQ;
      if(dst != NULL) {
        *src = MySrc;
      }
      return (size_t)(-1);
    }
    if(x == 0) {
      // Encountered the terminating NUL: done.
      if(dst != NULL) {
        dst[Count] = 0;
        *src = NULL;
      }
      return Count;
    }

    Units = (x == 4) ? 2 : 1;
    if(dst != NULL) {
      if(Units > (len - Count)) {
        break;                // A surrogate pair does not fit; stop before it
      }
      dst[Count] = Pair[0];
      if(Units == 2) {
        dst[Count + 1] = Pair[1];
      }
    }
    MySrc += x;
    Count += Units;
  }

  // Only reached when dst is not NULL and it has no more room.
  *src = MySrc;
  return Count;
}
538\r
/** Convert a multibyte character string into a wide-character string.

    The mbstowcs function converts a sequence of multibyte characters that
    begins in the initial shift state from the array pointed to by Src into
    a sequence of corresponding wide characters and stores not more than limit
    wide characters into the array pointed to by Dest.  No multibyte
    characters that follow a null character (which is converted into a null
    wide character) will be examined or converted.

    The conversion uses a state object local to this call, so it always
    starts from the initial state and the conversion state of the mbtowc
    function is not affected.

    No more than Limit elements will be modified in the array pointed to by Dest.
    If copying takes place between objects that overlap,
    the behavior is undefined.

    @param[out]   Dest    Pointer to the array to receive the converted string.
    @param[in]    Src     Pointer to the string to be converted.
    @param[in]    Limit   Maximum number of elements to be written to Dest.

    @return   If an invalid multibyte character is encountered, the mbstowcs
              function returns (size_t)(-1).  Otherwise, the mbstowcs function
              returns the number of array elements modified, not including a
              terminating null wide character, if any.

    Declared in: stdlib.h
**/
size_t
mbstowcs(
  wchar_t *pwcs,
  const char *s,
  size_t n
  )
{
  mbstate_t   State;

  /* pwcs may be NULL */
  /* s may be NULL */

  memset(&State, 0, sizeof(State));   // The initial conversion state
  return mbsrtowcs(pwcs, &s, n, &State);
}
578\r
/** The btowc function determines whether C constitutes a valid single-byte
    character in the initial shift state.

    In UTF-8 the single-byte characters are exactly the bytes 0x00 through
    0x7F, and each one has the same value as a wide character.  The answer is
    computed directly from the byte value, so no conversion state is read or
    modified and the result does not depend on the byte order of an int.

    @param[in]  c   EOF, or a byte value (it is reduced to an unsigned char).

    @return   The btowc function returns WEOF if c has the value EOF or if
              (unsigned char)C does not constitute a valid single-byte
              character in the initial shift state.  Otherwise, it returns the
              wide character representation of that character.

    Declared in: wchar.h
**/
wint_t
btowc(int c)
{
  unsigned char   Byte;

  if (c == EOF) {
    return WEOF;
  }
  Byte = (unsigned char)c;
  if (Byte >= 0x80) {
    // Lead bytes, continuation bytes, and bytes that never occur in UTF-8.
    return WEOF;
  }
  return (wint_t)Byte;
}
607\r
608// ######################## Wide to Narrow Conversions #######################\r
609\r
/** Convert one wide character to its multibyte (UTF-8) representation.

    If S is a null pointer, the wcrtomb function is equivalent to the call:<BR>
@verbatim
    wcrtomb(buf, L'\0', ps)
@endverbatim
    where buf is an internal buffer; that is, it returns 1.

    If S is not a null pointer, the wcrtomb function determines the number of bytes needed
    to represent the multibyte character that corresponds to the wide character given by wc,
    and stores the multibyte character representation in the array whose first element is
    pointed to by S.  If wc is a null wide character, a null byte is stored.

    Exactly the returned number of bytes is stored; no terminating NUL is
    appended after the character.

    @param[out]     s       Where to store the multibyte character, or NULL.
    @param[in]      wchar   The wide character to convert.
    @param[in,out]  ps      Conversion state.  Not used by this implementation.

    @return   The wcrtomb function returns the number of bytes stored in the
              array object.  If the conversion fails, (size_t)(-1) is
              returned.

    Declared in: wchar.h
**/
size_t
wcrtomb(
  char *s,
  wchar_t wchar,
  mbstate_t *ps
  )
{
  char      Buff[8];          // One encoded character plus EncodeUtf8's NUL
  ssize_t   NumBytes;

  /* s may be NULL */
  if (s == NULL) {
    return 1;
  }
  if (wchar == L'\0') {
    *s = '\0';
    return 1;
  }

  // EncodeUtf8 always appends a NUL after the bytes it produces.  Encode
  // into a local buffer and copy only the character itself, so that nothing
  // beyond the reported length is written to the caller's array.
  NumBytes = EncodeUtf8(Buff, &wchar, 1);
  if (NumBytes < 0) {
    return (size_t)(-1);
  }
  memcpy(s, Buff, (size_t)NumBytes);
  return (size_t)NumBytes;
}
656\r
/** Convert a wide character into a multibyte character.

    The wctomb function determines the number of bytes needed to represent the
    multibyte character corresponding to the wide character given by WC, and
    stores the multibyte character representation in the array whose first
    element is pointed to by S (if S is not a null pointer).  If WC is a null
    wide character, a null byte is stored.

    The conversion is delegated to wcrtomb.

    @param[out]   S     Pointer to the object to receive the converted multibyte character.
    @param[in]    WC    Wide character to be converted.

    @return   If S is a null pointer, the wctomb function returns zero,
              indicating that multibyte character encodings do not have
              state-dependent encodings.  If S is not a null pointer, the
              value returned by wcrtomb for WC, converted to int, is returned.

    Declared in: stdlib.h
**/
int
wctomb(
  char *s,
  wchar_t wchar
  )
{
  int   NumBytes = 0;         // Answer to the "state-dependent?" query: no

  if (s != NULL) {
    NumBytes = (int)wcrtomb(s, wchar, NULL);
  }
  return NumBytes;
}
699\r
/** Convert a wide-character (UTF-16) string to a multibyte (UTF-8) string, restartably.

    The wide characters in the array indirectly pointed to by pwcs are
    encoded with EncodeUtf8.  A high surrogate that is immediately followed
    by a low surrogate is passed to the encoder as one pair.

    If S is not a null pointer, the converted characters are stored into the
    array pointed to by S.  Conversion continues up to and including a
    terminating null wide character, which is also stored.  Conversion stops
    earlier when the next multibyte character would exceed the limit of N
    total bytes to be stored into the array pointed to by S; a multibyte
    character is never stored partially.

    If S is a null pointer nothing is stored, N is ignored, *pwcs is left
    unchanged, and the return value is the number of bytes that converting
    the whole string would produce.

    If S is not a null pointer, the pointer object pointed to by pwcs is
    assigned either a null pointer (if conversion stopped due to reaching
    a terminating null wide character) or the address just past the last wide
    character converted.

    @param[out]     s     Destination array, or NULL to only count.
    @param[in,out]  pwcs  Address of the pointer to the wide-character string.
    @param[in]      n     Number of bytes available at s.
    @param[in,out]  ps    Conversion state.  Not used by this implementation.

    @return   The number of bytes in the resulting multibyte character
              sequence, not including the terminating null character (if any),
              or (size_t)(-1) if the encoder reports a failure.

    Declared in: wchar.h
**/
size_t
wcsrtombs(
  char *s,
  const wchar_t **pwcs,
  size_t n,
  mbstate_t *ps
)
{
  const wchar_t  *Src;
  wchar_t         Unit[2];    // One character, or one surrogate pair
  char            Buff[8];    // Up to four bytes plus EncodeUtf8's NUL
  size_t          Count;      // Bytes produced so far
  ssize_t         Units;      // Wide characters consumed by this step
  ssize_t         NumBytes;

  /* s may be NULL */
  /* ps is unused */

  if (pwcs == NULL || *pwcs == NULL) {
    return 0;
  }

  Src   = *pwcs;
  Count = 0;
  for (;;) {
    if (*Src == L'\0') {
      if (s != NULL) {
        if (Count >= n) {
          break;              // No room left for the terminator
        }
        s[Count] = '\0';
        *pwcs = NULL;
      }
      return Count;
    }

    Unit[0] = Src[0];
    Units   = 1;
    // Src[0] is not the terminator, so Src[1] may be examined.
    if ((Unit[0] >= 0xD800) && (Unit[0] <= 0xDBFF) &&
        (Src[1]  >= 0xDC00) && (Src[1]  <= 0xDFFF)) {
      Unit[1] = Src[1];
      Units   = 2;
    }

    NumBytes = EncodeUtf8(Buff, Unit, Units);
    if (NumBytes < 0) {
      return (size_t)(-1);
    }
    if (s != NULL) {
      if ((size_t)NumBytes > (n - Count)) {
        break;                // The next character would exceed the limit
      }
      memcpy(s + Count, Buff, (size_t)NumBytes);
    }
    Count += (size_t)NumBytes;
    Src   += Units;
  }

  // Only reached when s is not NULL and it has no more room.
  *pwcs = Src;
  return Count;
}
767\r
/** Convert a wide-character string into a multibyte character string.

    The wide characters in the array pointed to by Src are converted and
    stored into the array pointed to by Dest.  The work is delegated to
    wcsrtombs, which is given a private copy of the source pointer and a
    NULL conversion state.

    No more than Limit bytes will be modified in the array pointed to by Dest.
    If copying takes place between objects that overlap,
    the behavior is undefined.

    @param[out]   Dest    Pointer to the array to receive the converted string.
    @param[in]    Src     Pointer to the string to be converted.
    @param[in]    Limit   Maximum number of elements to be written to Dest.

    @return   The value returned by wcsrtombs: the number of bytes produced,
              not including a terminating null character, if any.

    Declared in: stdlib.h
**/
size_t
wcstombs(
  char *s,
  const wchar_t *pwcs,
  size_t n
)
{
  const wchar_t  *Cursor;     // wcsrtombs may advance or clear this
  size_t          Produced;

  /* s may be NULL */
  Cursor   = pwcs;
  Produced = wcsrtombs(s, &Cursor, n, NULL);
  return Produced;
}
805\r
/** The wctob function determines whether C corresponds to a member of the extended
    character set whose multibyte character representation is a single byte when in the initial
    shift state.

    wctob has to agree with wcrtomb.  In UTF-8 only the characters 0x00
    through 0x7F are represented by a single byte; every other wide character
    takes two or more bytes and therefore has no single-byte representation.

    @param[in]  c   The wide character to examine, or WEOF.

    @return   The wctob function returns EOF if C does not correspond to a multibyte
              character with length one in the initial shift state.  Otherwise, it
              returns the single-byte representation of that character as an
              unsigned char converted to an int.

    Declared in: wchar.h
**/
int
wctob(wint_t c)
{
  if (c == WEOF) {
    return EOF;
  }
  // Any bit set above the low seven means the character needs more than
  // one byte.  Written as a mask so it is independent of wint_t's signedness.
  if ((c & ~(wint_t)0x7F) != 0) {
    return EOF;
  }
  return (int)c;
}