]>
Commit | Line | Data |
---|---|---|
a7a8363d | 1 | /** @file\r |
2 | Copyright (c) 2012, Intel Corporation. All rights reserved.<BR>\r | |
3 | This program and the accompanying materials\r | |
4 | are licensed and made available under the terms and conditions of the BSD License\r | |
5 | which accompanies this distribution. The full text of the license may be found at\r | |
6 | http://opensource.org/licenses/bsd-license.php\r | |
7 | \r | |
8 | THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,\r | |
9 | WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.\r | |
10 | **/\r | |
11 | #include <assert.h>\r | |
12 | #include <string.h>\r | |
13 | #include <errno.h>\r | |
14 | #include <stdlib.h>\r | |
15 | #include <wchar.h>\r | |
16 | #include <sys/types.h>\r | |
17 | \r | |
/* Integer type used to hold a Unicode code point value. */
typedef int ch_UCS4;

/* Conversion state used by DecodeOneStateful() whenever its caller passes a
   NULL state pointer.  Zero-filled, which is the initial conversion state. */
static mbstate_t LocalConvState = {0};
21 | \r | |
/** Classification table for UTF-8 bytes, indexed by the byte value.
    See RFC 3629 for details of the encoding.

    TABLE ENCODING:
      Low Nibble:   For a byte that can start a sequence, the total number of
                    bytes in that sequence (1, 2, 3, or 4).  Zero marks a byte
                    that can not start a sequence: a continuation byte
                    (0x80-0xBF) or a byte that is never a valid lead byte
                    (0xC0, 0xC1, 0xF5-0xFF).
      High Nibble:  For a lead byte, a bit mask selecting which ranges of
                    second byte are acceptable.  For a continuation byte, a
                    single bit identifying the range the byte falls in:
                      0x90 => 0x80-0x8F,  0xA0 => 0x90-0x9F,  0xC0 => 0xA0-0xBF.

    example:
      SequenceLength = code[c0] & 0x0F;
      Mask           = 0x80 | code[c0];

      A following byte cX is acceptable if:  (code[cX] & Mask) > 0x80

    Lead bytes with a restricted second byte:
      0xE0 (0x43) accepts only 0xA0-0xBF.
      0xED (0x33) accepts only 0x80-0x9F.
      0xF0 (0x64) accepts only 0x90-0xBF.
      0xF4 (0x14) accepts only 0x80-0x8F.
    All other lead bytes (0x72, 0x73, 0x74) accept any byte in 0x80-0xBF.
*/
static
UINT8 utf8_code_length[256] = {
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 00-0F */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 10-1F */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 20-2F */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 30-3F */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 40-4F */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 50-5F */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 60-6F */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, /* 70-7F */
  0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, /* 80-8F */
  0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, 0xA0, /* 90-9F */
  0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, /* A0-AF */
  0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, /* B0-BF */
  0x00, 0x00, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, /* C0-C1 + C2-CF */
  0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, 0x72, /* D0-DF */
  0x43, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x33, 0x73, 0x73, /* E0-EF */
  0x64, 0x74, 0x74, 0x74, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* F0-F4 + F5-FF */
};
59 | \r | |
60 | /** Process one byte of a multibyte character.\r | |
61 | \r | |
62 | @param ch\r | |
63 | @param ps\r | |
64 | \r | |
65 | @retval -2\r | |
66 | @retval -1\r | |
67 | @retval 1:4\r | |
68 | **/\r | |
69 | static\r | |
70 | int\r | |
71 | ProcessOneByte(unsigned char ch, mbstate_t *ps)\r | |
72 | {\r | |
73 | UINT32 Mask;\r | |
74 | UINT32 Length;\r | |
75 | int RetVal = 0;\r | |
76 | \r | |
77 | if(ps->A > 3) {\r | |
78 | // We are in an invalid state\r | |
79 | ps->A = 0; // Initial State\r | |
80 | }\r | |
81 | ps->C[ps->A] = ch; // Save the current character\r | |
82 | Mask = utf8_code_length[ch];\r | |
83 | \r | |
84 | if(ps->A == 0) { // Initial State. First byte of sequence.\r | |
85 | ps->E = Mask | 0x80;\r | |
86 | Length = Mask & 0xF;\r | |
87 | switch(Length) {\r | |
88 | case 0: // State 0, Code 0\r | |
89 | errno = EILSEQ;\r | |
90 | RetVal = -1;\r | |
91 | ps->E = 1; // Consume this character\r | |
92 | break;\r | |
93 | case 1: // State 0, Code 1\r | |
94 | // ASCII-7 Character\r | |
95 | ps->B = ps->D[0] = ch;\r | |
96 | RetVal = 1;\r | |
97 | break;\r | |
98 | default: // State 0, Code 2, 3, 4\r | |
99 | ps->A = 1; // Next state is State-1\r | |
100 | RetVal = -2; // Incomplete but potentially valid character\r | |
101 | break;\r | |
102 | }\r | |
103 | }\r | |
104 | else {\r | |
105 | // We are in state 1, 2, or 3 and processing a surrogate byte\r | |
106 | Length = ps->E & 0xF;\r | |
107 | if((Mask & ps->E) > 0x80) {\r | |
108 | // This byte is valid\r | |
109 | switch(ps->A) { // Process based upon our current state\r | |
110 | case 1: // Second byte of the sequence.\r | |
111 | if(Length == 2) { // State 1, Code 2\r | |
112 | Length = ((ps->C[0] & 0x1f) << 6) + (ps->C[1] & 0x3f);\r | |
113 | assert ((Length > 0x007F) && (Length <= 0x07FF));\r | |
114 | ps->B = ps->D[0] = (UINT16)Length;\r | |
115 | ps->A = 0; // Next state is State-0\r | |
116 | RetVal = 2;\r | |
117 | }\r | |
118 | else { // This isn't the last character, get more. State 1, Code 3 or 4\r | |
119 | ps->A = 2;\r | |
120 | RetVal = -2;\r | |
121 | }\r | |
122 | break;\r | |
123 | case 2: // Third byte of the sequence\r | |
124 | if(Length == 3) {\r | |
125 | Length = ((ps->C[0] & 0x0f) << 12) + ((ps->C[1] & 0x3f) << 6) + (ps->C[2] & 0x3f);\r | |
126 | assert ((Length > 0x07FF) && (Length <= 0xFFFF));\r | |
127 | ps->B = ps->D[0] = (UINT16)Length;\r | |
128 | ps->A = 0; // Next state is State-0\r | |
129 | RetVal = 3;\r | |
130 | }\r | |
131 | else {\r | |
132 | ps->A = 3;\r | |
133 | RetVal = -2;\r | |
134 | }\r | |
135 | break;\r | |
136 | case 3: // Fourth byte of the sequence\r | |
137 | if(Length == 4) {\r | |
138 | Length = ((ps->C[0] & 0x7) << 18) + ((ps->C[1] & 0x3f) << 12) +\r | |
139 | ((ps->C[2] & 0x3f) << 6) + (ps->C[3] & 0x3f);\r | |
140 | ps->B = Length;\r | |
141 | assert ((Length > 0xFFFF) && (Length <= 0x10ffff));\r | |
142 | \r | |
143 | /* compute and append the two surrogates: */\r | |
144 | \r | |
145 | /* translate from 10000..10FFFF to 0..FFFF */\r | |
146 | Length -= 0x10000;\r | |
147 | \r | |
148 | /* high surrogate = top 10 bits added to D800 */\r | |
149 | ps->D[0] = (UINT16)(0xD800 + (Length >> 10));\r | |
150 | \r | |
151 | /* low surrogate = bottom 10 bits added to DC00 */\r | |
152 | ps->D[1] = (UINT16)(0xDC00 + (Length & 0x03FF));\r | |
153 | ps->A = 0; // Next state is State-0\r | |
154 | RetVal = 4;\r | |
155 | }\r | |
156 | else {\r | |
157 | errno = EILSEQ;\r | |
158 | ps->A = 0;\r | |
159 | RetVal = -1;\r | |
160 | ps->E = 4; // Can't happen, but consume this character anyway\r | |
161 | }\r | |
162 | break;\r | |
163 | }\r | |
164 | }\r | |
165 | else { // Invalid surrogate character\r | |
166 | errno = EILSEQ;\r | |
167 | ps->A = 0; // Next is State-0\r | |
168 | RetVal = -1;\r | |
169 | ps->E = 0; // Don't Consume, it may be an initial byte\r | |
170 | }\r | |
171 | }\r | |
172 | return RetVal;\r | |
173 | }\r | |
174 | \r | |
175 | /** Convert one Multibyte sequence.\r | |
176 | \r | |
177 | @param Dest\r | |
178 | @param Src\r | |
179 | @param Len\r | |
180 | @param pS\r | |
181 | \r | |
182 | @retval -2 Bytes processed comprise an incomplete, but potentially valid, character.\r | |
183 | @retval -1 An encoding error was encountered. ps->E indicates the number of bytes consumed.\r | |
184 | @retval 0 Either Src is NULL or it points to a NUL character.\r | |
185 | @retval 1:N N bytes were consumed producing a valid wide character.\r | |
186 | **/\r | |
187 | int\r | |
188 | DecodeOneStateful(\r | |
189 | wchar_t *Dest, // Pointer to output location, or NULL\r | |
190 | const char *Src, // Multibyte Source (UTF8)\r | |
191 | ssize_t Len, // Max Number of bytes to convert\r | |
192 | mbstate_t *pS // Pointer to State struct., or NULL\r | |
193 | )\r | |
194 | {\r | |
195 | const char *SrcEnd;\r | |
196 | int NumConv;\r | |
197 | unsigned char ch;\r | |
198 | \r | |
199 | if((Src == NULL) || (*Src == '\0')) {\r | |
200 | return 0;\r | |
201 | }\r | |
202 | if(pS == NULL) {\r | |
203 | pS = &LocalConvState;\r | |
204 | }\r | |
205 | SrcEnd = Src + Len;\r | |
206 | NumConv = 0;\r | |
207 | while(Src < SrcEnd) {\r | |
208 | ch = (unsigned char)*Src++;\r | |
209 | NumConv = ProcessOneByte(ch, pS);\r | |
210 | if(NumConv != -2)\r | |
211 | break;\r | |
212 | }\r | |
213 | if((NumConv > 0) && (Dest != NULL)) {\r | |
214 | Dest[0] = pS->D[0];\r | |
215 | if(NumConv == 4) {\r | |
216 | Dest[1] = pS->D[1];\r | |
217 | }\r | |
218 | }\r | |
219 | return NumConv;\r | |
220 | }\r | |
221 | \r | |
/** Convert wide characters (UTF-16, or UCS-4 where wchar_t is wide enough)
    into multibyte characters (UTF-8).

    A high surrogate immediately followed by a low surrogate is combined and
    encoded as one four byte sequence.  An isolated surrogate is encoded as a
    three byte sequence, as are all other values in 0x0800-0xFFFF.

    @param[out] Dest  Buffer to receive the converted, NUL terminated, string,
                      or NULL to only compute the number of bytes required.
                      When not NULL it must have room for the returned number
                      of bytes plus one for the terminator.
    @param[in]  s     Pointer to the wide characters to convert.
    @param[in]  size  Number of wide characters in s to convert.

    @return   The number of bytes produced, not counting the terminating NUL.
              On failure (ssize_t)-1 is returned and errno is set to EINVAL
              (size is negative or too large) or EILSEQ (a value above
              0x10FFFF was encountered).
**/
ssize_t
EncodeUtf8(char *Dest, wchar_t *s, ssize_t size)
{
  char           *v;          /* next free byte in destination */
  ssize_t         nneeded;    /* number of result bytes needed */
  ssize_t         i;          /* index into s of next input element */
  int             NumInBuff;  // number of bytes in Buff
  char            Buff[4];    // Buffer into which each character is built
  unsigned long   ch;
  unsigned long   ch2;

  assert(s != NULL);
  assert(size >= 0);

  v = Dest;
  nneeded = 0;
  // Reject sizes whose worst-case byte count can not be represented.  The
  // comparison is done unsigned so that it can not itself overflow.
  if((size_t)size > ((((size_t)-1) >> 1) / MB_LEN_MAX)) {
    errno = EINVAL;
    return (ssize_t)-1;
  }

  for (i = 0; i < size;) {
    ch = (unsigned long)s[i++];

    /* Check for a high surrogate followed by a low surrogate and combine
       the two to form a UCS4 value. */
    if ((ch >= 0xD800) && (ch <= 0xDBFF) && (i < size)) {
      ch2 = (unsigned long)s[i];
      if ((ch2 >= 0xDC00) && (ch2 <= 0xDFFF)) {
        ch = (((ch - 0xD800) << 10) | (ch2 - 0xDC00)) + 0x10000;
        i++;
      }
    }

    if (ch < 0x80) {
      /* Encode ASCII -- One Byte */
      Buff[0] = (char)ch;
      NumInBuff = 1;
    }
    else if (ch < 0x0800) {
      /* Two Byte */
      Buff[0] = (char)(0xc0 | (ch >> 6));
      Buff[1] = (char)(0x80 | (ch & 0x3f));
      NumInBuff = 2;
    }
    else if (ch < 0x10000) {
      /* Three Byte -- includes isolated surrogates */
      Buff[0] = (char)(0xe0 | (ch >> 12));
      Buff[1] = (char)(0x80 | ((ch >> 6) & 0x3f));
      Buff[2] = (char)(0x80 | (ch & 0x3f));
      NumInBuff = 3;
    }
    else if (ch <= 0x10FFFF) {
      /* Four Byte */
      Buff[0] = (char)(0xf0 | (ch >> 18));
      Buff[1] = (char)(0x80 | ((ch >> 12) & 0x3f));
      Buff[2] = (char)(0x80 | ((ch >> 6) & 0x3f));
      Buff[3] = (char)(0x80 | (ch & 0x3f));
      NumInBuff = 4;
    }
    else {
      /* Not a Unicode code point */
      if(Dest != NULL) {
        *v = '\0';      // Leave what was converted so far terminated
      }
      errno = EILSEQ;
      return (ssize_t)-1;
    }

    /* Buff now holds the converted character, NumInBuff (1 - 4) bytes long.
       Every encoded character, including four byte ones, passes through
       here so that it is both stored and counted. */
    if(Dest != NULL) {
      memcpy(v, Buff, (size_t)NumInBuff);
      v += NumInBuff;
    }
    nneeded += NumInBuff;
  }
  if(Dest != NULL) {
    // Terminate the destination string.
    *v = '\0';
  }
  return nneeded;
}
304 | \r | |
305 | // ######################## Narrow to Wide Conversions #######################\r | |
306 | \r | |
307 | /** If ps is not a null pointer, the mbsinit function determines whether the\r | |
308 | pointed-to mbstate_t object describes an initial conversion state.\r | |
309 | \r | |
310 | @return The mbsinit function returns nonzero if ps is a null pointer\r | |
311 | or if the pointed-to object describes an initial conversion\r | |
312 | state; otherwise, it returns zero.\r | |
313 | \r | |
314 | Declared in: wchar.h\r | |
315 | **/\r | |
316 | int\r | |
317 | mbsinit(const mbstate_t *ps)\r | |
318 | {\r | |
319 | if((ps == NULL) || (ps->A == 0)) {\r | |
320 | return 1;\r | |
321 | }\r | |
322 | return 0;\r | |
323 | }\r | |
324 | \r | |
/** Determine the length of the next multibyte character.

    Implemented as a call to:
    @verbatim
      mbrtowc(NULL, s, n, ps)
    @endverbatim
    so no wide character is stored and ps is passed through unchanged.

    @param[in]      s     The multibyte character to measure.
    @param[in]      n     Maximum number of bytes of s to examine.
    @param[in,out]  ps    Conversion state, passed on to mbrtowc.

    @return   Whatever mbrtowc returns: a value between zero and n,
              inclusive, (size_t)(-2), or (size_t)(-1).

    Declared in: wchar.h
**/
size_t
mbrlen(
  const char *s,
  size_t      n,
  mbstate_t  *ps
  )
{
  size_t  NumBytes;

  NumBytes = mbrtowc(NULL, s, n, ps);
  return NumBytes;
}
346 | \r | |
/** Determine the number of bytes comprising a multibyte character.

    If s is not a null pointer, the mblen function determines the number of
    bytes contained in the multibyte character pointed to by s.  The work is
    done by mbrlen(s, n, NULL).

    @param[in]  s   NULL to query whether multibyte characters have
                    state-dependent encodings.  Otherwise, points to a
                    multibyte character.
    @param[in]  n   The maximum number of bytes of s to examine.

    @return   If s is a null pointer, zero.  If s is not a null pointer,
              either 0 (if s points to the null character), the number of
              bytes contained in the multibyte character (if the next n or
              fewer bytes form a valid multibyte character), or -1 (if they
              do not).  An incomplete character is reported as -1 as well,
              since this interface has no way to report "incomplete".

    Declared in: stdlib.h
**/
int
mblen(
  const char *s,
  size_t      n
  )
{
  size_t  Result;

  Result = mbrlen(s, n, NULL);
  if((Result == (size_t)-1) || (Result == (size_t)-2)) {
    // Both "invalid" and "incomplete" mean: no valid character in n bytes.
    return -1;
  }
  return (int)Result;
}
378 | \r | |
/** Convert the next multibyte character into a wide character.

    If s is a null pointer, pwc and n are ignored and zero is returned.

    If s is not a null pointer, the mbrtowc function inspects at most n bytes
    beginning with the byte pointed to by s to determine the number of bytes
    needed to complete the next multibyte character.  If the next multibyte
    character is complete and valid, and pwc is not a null pointer, the
    corresponding wide character is stored through pwc.

    The conversion is performed by DecodeOneStateful().  Because no multibyte
    character is longer than MB_LEN_MAX bytes, n is limited to MB_LEN_MAX
    before it is converted to the signed length that function expects; this
    keeps a very large n from turning into a negative length.

    NOTE(review): DecodeOneStateful stores a surrogate pair (two elements)
    through pwc for a four byte sequence; confirm callers provide room.

    @param[out]     pwc   Where to store the wide character, or NULL.
    @param[in]      s     The multibyte character to convert, or NULL.
    @param[in]      n     Maximum number of bytes of s to examine.
    @param[in,out]  ps    Conversion state, or NULL.

    @retval   0             s is NULL, or s points to a NUL character.
    @retval   1:n           A valid multibyte character was completed; the
                            value is the length of its byte sequence.
    @retval   (size_t)(-2)  n is zero, or the bytes examined form an
                            incomplete (but potentially valid) multibyte
                            character.
    @retval   (size_t)(-1)  An encoding error occurred; errno is EILSEQ.

    Declared in: wchar.h
**/
size_t
mbrtowc(
  wchar_t     *pwc,
  const char  *s,
  size_t       n,
  mbstate_t   *ps
  )
{
  int     RetVal;

  if((s != NULL) && (n == 0)) {
    // No bytes may be examined, so nothing can be completed.
    return (size_t)-2;
  }
  if(n > MB_LEN_MAX) {
    n = MB_LEN_MAX;
  }
  RetVal = DecodeOneStateful(pwc, s, (ssize_t)n, ps);
  return (size_t)RetVal;
}
426 | \r | |
/** Convert a multibyte character into a wide character.

    If s is not a null pointer, the mbtowc function inspects at most n bytes
    beginning with the byte pointed to by s to determine the number of bytes
    needed to complete the next multibyte character.  If the next multibyte
    character is complete and valid, and pwc is not a null pointer, the
    corresponding wide character is stored through pwc.  The work is done by
    mbrtowc(pwc, s, n, NULL).

    @param[out]   pwc   Pointer to a wide-character object to receive the converted character.
    @param[in]    s     Pointer to a multibyte character to convert.
    @param[in]    n     Maximum number of bytes of s to examine.

    @return   If s is a null pointer, zero.  If s is not a null pointer,
              either 0 (if s points to the null character), the number of
              bytes contained in the converted multibyte character (if the
              next n or fewer bytes form a valid multibyte character), or -1
              (if they do not).  An incomplete character is reported as -1
              as well, since this interface has no way to report
              "incomplete".

    Declared in: stdlib.h
**/
int
mbtowc(
  wchar_t     *pwc,
  const char  *s,
  size_t       n
  )
{
  size_t  Result;

  Result = mbrtowc(pwc, s, n, NULL);
  if((Result == (size_t)-1) || (Result == (size_t)-2)) {
    // Both "invalid" and "incomplete" mean: no valid character in n bytes.
    return -1;
  }
  return (int)Result;
}
465 | \r | |
/** Convert a multibyte character string into a wide-character string,
    restartably.

    The mbsrtowcs function converts a sequence of multibyte characters from
    the array indirectly pointed to by src into a sequence of corresponding
    wide characters.  If dst is not a null pointer, the converted characters
    are stored into the array pointed to by dst.  Conversion continues up to
    and including a terminating null character, which is also stored.
    Conversion stops earlier in two cases: when a sequence of bytes is
    encountered that does not form a valid multibyte character, or (if dst is
    not a null pointer) when the next character would not fit within len
    elements of dst.  When dst is a null pointer len is ignored, so the call
    can be used to measure the converted string.

    Each character is decoded by DecodeOneStateful() into a local buffer and
    only then copied to dst.  A four byte sequence yields a surrogate pair,
    which occupies, and is counted as, two elements.

    If dst is not a null pointer, the pointer object pointed to by src is
    assigned either a null pointer (if conversion stopped due to reaching a
    terminating null character) or the address just past the last multibyte
    character converted (if any).

    @param[out]     dst   Destination array, or NULL.
    @param[in,out]  src   Address of the pointer to the source string.
    @param[in]      len   Capacity of dst in elements; ignored if dst is NULL.
    @param[in,out]  ps    Conversion state, or NULL.

    @return   (size_t)(-1) if an encoding error occurs (errno is EILSEQ), or
              (size_t)(-2) if the decoder reports an incomplete character.
              Otherwise, the number of wide-character elements produced, not
              including the terminating null character (if any).  Zero is
              returned if src or *src is NULL.

    Declared in: wchar.h
**/
size_t
mbsrtowcs(
  wchar_t      *dst,
  const char  **src,
  size_t        len,
  mbstate_t    *ps
  )
{
  const char   *MySrc;
  wchar_t       Units[2];     // One decoded character: 1 unit or a pair
  size_t        NumUnits;
  size_t        RetVal = 0;
  int           x;

  if((src == NULL) || (*src == NULL)) {
    return 0;
  }

  MySrc = *src;
  while((dst == NULL) || (RetVal < len)) {
    x = DecodeOneStateful(Units, MySrc, MB_LEN_MAX, ps);
    if(x < 0) {
      // -2: Incomplete character, -1: Encoding error
      return (size_t)x;
    }
    if(x == 0) {
      // Encountered NUL character: done.
      if(dst != NULL) {
        dst[RetVal] = 0;
        *src = NULL;
      }
      break;
    }
    // Successfully decoded a character
    NumUnits = (x == 4) ? 2 : 1;
    if(dst != NULL) {
      if((len - RetVal) < NumUnits) {
        // A surrogate pair does not fit in the one element that is left.
        // *src still points at this character so it can be converted later.
        break;
      }
      dst[RetVal] = Units[0];
      if(NumUnits == 2) {
        dst[RetVal + 1] = Units[1];
      }
      *src = MySrc + x;
    }
    MySrc  += x;
    RetVal += NumUnits;
  }
  return RetVal;
}
538 | \r | |
/** Convert a multibyte character string into a wide-character string.

    The conversion is delegated to mbsrtowcs(), which is handed the address
    of a private copy of the source pointer and a NULL state pointer.

    @param[out]   pwcs  Pointer to the array to receive the converted string.
                        May be NULL.
    @param[in]    s     Pointer to the string to be converted.  May be NULL.
    @param[in]    n     Maximum number of elements to be written to pwcs.

    @return   The value returned by mbsrtowcs: (size_t)(-1) if an invalid
              multibyte character is encountered; otherwise the count it
              reports, which does not include a terminating null wide
              character, if any.

    Declared in: stdlib.h
**/
size_t
mbstowcs(
  wchar_t    *pwcs,
  const char *s,
  size_t      n
  )
{
  const char *Cursor;   // mbsrtowcs may rewrite the pointer it is given
  size_t      Count;

  Cursor = s;
  Count  = mbsrtowcs(pwcs, &Cursor, n, NULL);
  return Count;
}
578 | \r | |
/** The btowc function determines whether c constitutes a valid single-byte
    character in the initial shift state.

    The byte is decoded by DecodeOneStateful() using a private, zero-filled
    (initial) state object, so the result depends neither on conversions that
    are in progress elsewhere nor on how an int is laid out in memory.

    @param[in]  c   The byte to examine, or EOF.

    @return   WEOF if c has the value EOF or if (unsigned char)c does not
              constitute a valid single-byte character in the initial shift
              state.  Otherwise, the wide character representation of that
              character.

    Declared in: wchar.h
**/
wint_t
btowc(int c)
{
  mbstate_t   State;
  wchar_t     Dest;
  char        Byte;
  int         x;

  if (c == EOF) {
    return WEOF;
  }
  memset(&State, 0, sizeof(State));   // Initial conversion state
  Byte = (char)(unsigned char)c;
  x = DecodeOneStateful(&Dest, &Byte, 1, &State);
  if(x == 0) {
    return 0;               // The NUL character
  }
  if(x == 1) {
    return (wint_t)Dest;
  }
  return WEOF;              // Not a complete character on its own
}
607 | \r | |
608 | // ######################## Wide to Narrow Conversions #######################\r | |
609 | \r | |
/** Convert a wide character into its multibyte (UTF-8) representation.

    If s is a null pointer nothing is stored and 1 is returned, which is the
    length of the multibyte representation of a null wide character.

    If s is not a null pointer, the wcrtomb function stores the multibyte
    representation of wchar in the array whose first element is pointed to
    by s.  If wchar is a null wide character, a single null byte is stored.

    The character is built by EncodeUtf8() in a local buffer because that
    function appends a terminating NUL; only the bytes of the character
    itself are copied to s, so exactly the returned number of bytes is
    written.

    @param[out]     s       Where to store the multibyte character, or NULL.
    @param[in]      wchar   The wide character to convert.
    @param[in,out]  ps      Conversion state.  Not used: UTF-8 has no shift
                            states.

    @return   The number of bytes stored in the array object, or
              (size_t)(-1) if EncodeUtf8 reports failure, in which case
              errno has been set by EncodeUtf8.

    Declared in: wchar.h
**/
size_t
wcrtomb(
  char       *s,
  wchar_t     wchar,
  mbstate_t  *ps
  )
{
  char      Buf[8];       // Up to 4 bytes of UTF-8 plus EncodeUtf8's NUL
  ssize_t   NumBytes;

  (void)ps;

  /* s may be NULL */
  if (s == NULL) {
    return 1;
  }
  if (wchar == L'\0') {
    *s = '\0';
    return 1;
  }
  NumBytes = EncodeUtf8(Buf, &wchar, 1);
  if (NumBytes > 0) {
    memcpy(s, Buf, (size_t)NumBytes);
  }
  return (size_t)NumBytes;
}
656 | \r | |
/** Convert a wide character into a multibyte character.

    When s is not a null pointer the conversion is delegated to
    wcrtomb(s, wchar, NULL) and its result is returned as an int.

    @param[out]   s       Pointer to the object to receive the converted
                          multibyte character, or NULL.
    @param[in]    wchar   Wide character to be converted.

    @return   If s is a null pointer, zero: multibyte character encodings do
              not have state-dependent encodings.  If s is not a null
              pointer, the value returned by wcrtomb: the number of bytes
              contained in the multibyte character corresponding to wchar,
              or -1 if the conversion failed.

    Declared in: stdlib.h
**/
int
wctomb(
  char    *s,
  wchar_t  wchar
  )
{
  int   Result;

  Result = 0;     // Answer to the "state dependent?" query: no.
  if (s != NULL) {
    Result = (int)wcrtomb(s, wchar, NULL);
  }
  return Result;
}
699 | \r | |
/** Convert a wide-character string into a multibyte (UTF-8) string,
    restartably.

    The wcsrtombs function converts a sequence of wide characters from the
    array indirectly pointed to by pwcs into a sequence of corresponding
    multibyte characters.  If s is not a null pointer, the converted
    characters are stored into the array pointed to by s.  Conversion
    continues up to and including a terminating null wide character, which
    is also stored.  Conversion stops earlier in two cases: when a wide
    character is reached that can not be converted, or (if s is not a null
    pointer) when the next multibyte character would exceed the limit of n
    total bytes to be stored into the array pointed to by s.  A multibyte
    character is never split.

    Each character is encoded by EncodeUtf8().  A high surrogate followed by
    a low surrogate is handed to it as a pair so that it may produce a single
    multibyte character.

    If s is not a null pointer, the pointer object pointed to by pwcs is
    assigned either a null pointer (if conversion stopped due to reaching a
    terminating null wide character) or the address just past the last wide
    character converted (if any).  If s is a null pointer, *pwcs is not
    modified and n is ignored.

    @param[out]     s     Destination array, or NULL to only measure.
    @param[in,out]  pwcs  Address of the pointer to the source string.
    @param[in]      n     Capacity of s in bytes; ignored if s is NULL.
    @param[in,out]  ps    Conversion state.  Not used: UTF-8 has no shift
                          states.

    @return   (size_t)(-1), with errno set to EILSEQ, if a wide character
              can not be converted.  Otherwise, the number of bytes in the
              resulting multibyte character sequence, not including the
              terminating null character (if any).  Zero is returned if pwcs
              or *pwcs is NULL.

    Declared in: wchar.h
**/
size_t
wcsrtombs(
  char           *s,
  const wchar_t **pwcs,
  size_t          n,
  mbstate_t      *ps
  )
{
  const wchar_t  *Src;
  wchar_t         Pair[2];    // The character being converted
  char            Buf[8];     // Up to 4 bytes of UTF-8 plus EncodeUtf8's NUL
  ssize_t         NumWide;
  ssize_t         NumBytes;
  size_t          Total = 0;

  (void)ps;

  /* s may be NULL */
  if (pwcs == NULL || *pwcs == NULL) {
    return 0;
  }

  Src = *pwcs;
  for (;;) {
    if (*Src == 0) {
      // Terminating null wide character
      if (s != NULL) {
        if (Total < n) {
          s[Total] = '\0';
          *pwcs = NULL;
        }
        else {
          *pwcs = Src;      // No room left for the terminator
        }
      }
      return Total;
    }

    // Src[0] is not the terminator, so Src[1] exists and may be examined.
    Pair[0] = Src[0];
    NumWide = 1;
    if ((Src[0] >= 0xD800) && (Src[0] <= 0xDBFF) &&
        (Src[1] >= 0xDC00) && (Src[1] <= 0xDFFF)) {
      Pair[1] = Src[1];
      NumWide = 2;
    }

    NumBytes = EncodeUtf8(Buf, Pair, NumWide);
    if (NumBytes <= 0) {
      // Nothing was produced for a non-null character: it is unconvertible.
      if (s != NULL) {
        *pwcs = Src;
      }
      errno = EILSEQ;
      return (size_t)-1;
    }

    if (s != NULL) {
      if ((size_t)NumBytes > (n - Total)) {
        // The next multibyte character would exceed the limit.
        *pwcs = Src;
        return Total;
      }
      memcpy(s + Total, Buf, (size_t)NumBytes);
    }
    Total += (size_t)NumBytes;
    Src   += NumWide;
  }
}
767 | \r | |
/** Convert a wide-character string into a multibyte character string.

    The conversion is delegated to wcsrtombs(), which is handed the address
    of a private copy of the source pointer and a NULL state pointer.

    @param[out]   s     Pointer to the array to receive the converted string.
                        May be NULL.
    @param[in]    pwcs  Pointer to the string to be converted.
    @param[in]    n     Maximum number of bytes to be written to s.

    @return   The value returned by wcsrtombs.

    Declared in: stdlib.h
**/
size_t
wcstombs(
  char           *s,
  const wchar_t  *pwcs,
  size_t          n
  )
{
  const wchar_t  *Cursor;   // wcsrtombs may rewrite the pointer it is given
  size_t          Count;

  Cursor = pwcs;
  Count  = wcsrtombs(s, &Cursor, n, NULL);
  return Count;
}
805 | \r | |
/** The wctob function determines whether c corresponds to a member of the
    extended character set whose multibyte character representation is a
    single byte when in the initial shift state.

    The multibyte encoding is UTF-8, in which only the values 0x00 through
    0x7F are represented by a single byte.  This is the inverse of btowc,
    which likewise accepts only those bytes.

    @param[in]  c   The wide character to examine, or WEOF.

    @return   EOF if c is WEOF or does not correspond to a multibyte
              character with length one in the initial shift state.
              Otherwise, the single-byte representation of that character
              as an unsigned char converted to an int.

    Declared in: wchar.h
**/
int
wctob(wint_t c)
{
  // The unsigned comparison also rejects negative values, should wint_t be
  // a signed type.
  if ((c == WEOF) || ((unsigned long)c > 0x7F)) {
    return EOF;
  }
  return (int)c;
}