]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // |
2 | // Copyright (c) 2015 Artyom Beilis (Tonkikh) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See | |
20effc67 | 5 | // accompanying file LICENSE or copy at |
f67539c2 TL |
6 | // http://www.boost.org/LICENSE_1_0.txt) |
7 | // | |
8 | ||
9 | #include <boost/nowide/utf8_codecvt.hpp> | |
10 | ||
11 | #include <boost/nowide/convert.hpp> | |
1e59de90 TL |
12 | #include "test.hpp" |
13 | #include "test_sets.hpp" | |
f67539c2 TL |
14 | #include <cstring> |
15 | #include <iomanip> | |
16 | #include <iostream> | |
17 | #include <locale> | |
18 | #include <vector> | |
19 | ||
1e59de90 TL |
// Some MSVC versions (_MSC_VER 1900 through 1916) have problems with an undefined symbol std::codecvt::id
// when the UTF character types are used as the internal type of the facet. See
// https://social.msdn.microsoft.com/Forums/vstudio/en-US/8f40dcd8-c67f-4eba-9134-a19b9178e481/vs-2015-rc-linker-stdcodecvt-error?forum=vcgeneral
// Workaround: use the fixed-width integer types instead (e.g. int16_t instead of char16_t)
#if !defined(_MSC_VER) || _MSC_VER < 1900 || _MSC_VER > 1916
#define BOOST_NOWIDE_REQUIRE_UTF_CHAR_WORKAROUND 0
#else
#define BOOST_NOWIDE_REQUIRE_UTF_CHAR_WORKAROUND 1
#endif
f67539c2 TL |
28 | |
29 | static const char* utf8_name = | |
30 | "\xf0\x9d\x92\x9e-\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82-\xE3\x82\x84\xE3\x81\x82.txt"; | |
31 | static const std::wstring wide_name_str = boost::nowide::widen(utf8_name); | |
32 | static const wchar_t* wide_name = wide_name_str.c_str(); | |
33 | ||
20effc67 | 34 | using cvt_type = std::codecvt<wchar_t, char, std::mbstate_t>; |
f67539c2 | 35 | |
1e59de90 TL |
36 | #if BOOST_NOWIDE_REQUIRE_UTF_CHAR_WORKAROUND |
37 | using utf16_char_t = int16_t; | |
38 | using utf32_char_t = int32_t; | |
39 | #else | |
40 | using utf16_char_t = char16_t; | |
41 | using utf32_char_t = char32_t; | |
42 | #endif | |
43 | ||
44 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN | |
45 | using cvt_type16 = std::codecvt<utf16_char_t, char, std::mbstate_t>; | |
46 | using cvt_type32 = std::codecvt<utf32_char_t, char, std::mbstate_t>; | |
47 | using utf8_utf16_codecvt = boost::nowide::utf8_codecvt<utf16_char_t>; | |
48 | using utf8_utf32_codecvt = boost::nowide::utf8_codecvt<utf32_char_t>; | |
49 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END | |
50 | ||
51 | void test_codecvt_basic() | |
52 | { | |
53 | // UTF-16 | |
54 | { | |
55 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN | |
56 | std::locale l(std::locale::classic(), new utf8_utf16_codecvt()); | |
57 | const cvt_type16& cvt = std::use_facet<cvt_type16>(l); | |
58 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END | |
59 | TEST_EQ(cvt.encoding(), 0); // Characters have a variable width | |
60 | TEST_EQ(cvt.max_length(), 4); // At most 4 UTF-8 code units are one internal char (one or two UTF-16 code units) | |
61 | TEST(!cvt.always_noconv()); // Always convert | |
62 | } | |
63 | // UTF-32 | |
64 | { | |
65 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN | |
66 | std::locale l(std::locale::classic(), new utf8_utf32_codecvt()); | |
67 | const cvt_type32& cvt = std::use_facet<cvt_type32>(l); | |
68 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END | |
69 | TEST_EQ(cvt.encoding(), 0); // Characters have a variable width | |
70 | TEST_EQ(cvt.max_length(), 4); // At most 4 UTF-8 code units are one internal char (one UTF-32 code unit) | |
71 | TEST(!cvt.always_noconv()); // Always convert | |
72 | } | |
73 | } | |
74 | ||
75 | void test_codecvt_unshift() | |
76 | { | |
77 | char buf[256]; | |
78 | // UTF-16 | |
79 | { | |
80 | const auto name16 = | |
81 | boost::nowide::utf::convert_string<utf16_char_t>(utf8_name, utf8_name + std::strlen(utf8_name)); | |
82 | ||
83 | utf8_utf16_codecvt cvt16; | |
84 | // Unshift on initial state does nothing | |
85 | std::mbstate_t mb{}; | |
86 | char* to_next; | |
87 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN | |
88 | const cvt_type16& cvt = cvt16; | |
89 | TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type16::ok); | |
90 | TEST(to_next == buf); | |
91 | const utf16_char_t* from_next; | |
92 | // Convert into a to small buffer | |
93 | TEST_EQ(cvt.out(mb, &name16.front(), &name16.back(), from_next, buf, buf + 1, to_next), cvt_type16::partial); | |
94 | TEST(from_next == &name16[1]); | |
95 | TEST(to_next == buf); | |
96 | // Unshift on non-default state is not possible | |
97 | TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type16::error); | |
98 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END | |
99 | } | |
100 | // UTF-32 | |
101 | { | |
102 | const auto name32 = | |
103 | boost::nowide::utf::convert_string<utf32_char_t>(utf8_name, utf8_name + std::strlen(utf8_name)); | |
104 | ||
105 | utf8_utf32_codecvt cvt32; | |
106 | // Unshift on initial state does nothing | |
107 | std::mbstate_t mb{}; | |
108 | char* to_next; | |
109 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN | |
110 | const cvt_type32& cvt = cvt32; | |
111 | TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type32::noconv); | |
112 | TEST(to_next == buf); | |
113 | const utf32_char_t* from_next; | |
114 | // Convert into a too small buffer | |
115 | TEST_EQ(cvt.out(mb, &name32.front(), &name32.back(), from_next, buf, buf + 1, to_next), cvt_type32::partial); | |
116 | TEST(from_next == &name32.front()); // Noting consumed | |
117 | TEST(to_next == buf); | |
118 | TEST(std::mbsinit(&mb) != 0); // State unchanged --> Unshift does nothing | |
119 | TEST_EQ(cvt.unshift(mb, buf, std::end(buf), to_next), cvt_type32::noconv); | |
120 | TEST(to_next == buf); | |
121 | BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END | |
122 | } | |
123 | } | |
124 | ||
f67539c2 TL |
125 | void test_codecvt_in_n_m(const cvt_type& cvt, size_t n, size_t m) |
126 | { | |
127 | const wchar_t* wptr = wide_name; | |
128 | size_t wlen = std::wcslen(wide_name); | |
129 | size_t u8len = std::strlen(utf8_name); | |
130 | const char* from = utf8_name; | |
1e59de90 | 131 | const char* from_end = from; |
f67539c2 TL |
132 | const char* real_end = utf8_name + u8len; |
133 | const char* from_next = from; | |
20effc67 | 134 | std::mbstate_t mb{}; |
f67539c2 TL |
135 | while(from_next < real_end) |
136 | { | |
1e59de90 | 137 | if(from == from_end) |
f67539c2 | 138 | { |
1e59de90 TL |
139 | from_end = from + n; |
140 | if(from_end > real_end) | |
141 | from_end = real_end; | |
f67539c2 TL |
142 | } |
143 | ||
144 | wchar_t buf[128]; | |
145 | wchar_t* to = buf; | |
146 | wchar_t* to_end = to + m; | |
147 | wchar_t* to_next = to; | |
148 | ||
149 | std::mbstate_t mb2 = mb; | |
1e59de90 | 150 | std::codecvt_base::result r = cvt.in(mb, from, from_end, from_next, to, to_end, to_next); |
f67539c2 | 151 | |
1e59de90 TL |
152 | int count = cvt.length(mb2, from, from_end, to_end - to); |
153 | TEST_EQ(std::memcmp(&mb, &mb2, sizeof(mb)), 0); | |
154 | TEST_EQ(count, from_next - from); | |
f67539c2 TL |
155 | |
156 | if(r == cvt_type::partial) | |
157 | { | |
1e59de90 TL |
158 | from_end += n; |
159 | if(from_end > real_end) | |
160 | from_end = real_end; | |
f67539c2 | 161 | } else |
1e59de90 | 162 | TEST_EQ(r, cvt_type::ok); |
f67539c2 TL |
163 | while(to != to_next) |
164 | { | |
165 | TEST(*wptr == *to); | |
166 | wptr++; | |
167 | to++; | |
168 | } | |
169 | to = to_next; | |
170 | from = from_next; | |
171 | } | |
172 | TEST(wptr == wide_name + wlen); | |
173 | TEST(from == real_end); | |
174 | } | |
175 | ||
176 | void test_codecvt_out_n_m(const cvt_type& cvt, size_t n, size_t m) | |
177 | { | |
178 | const char* nptr = utf8_name; | |
179 | size_t wlen = std::wcslen(wide_name); | |
180 | size_t u8len = std::strlen(utf8_name); | |
181 | ||
20effc67 | 182 | std::mbstate_t mb{}; |
f67539c2 TL |
183 | |
184 | const wchar_t* from_next = wide_name; | |
185 | const wchar_t* real_from_end = wide_name + wlen; | |
186 | ||
187 | char buf[256]; | |
188 | char* to = buf; | |
189 | char* to_next = to; | |
190 | char* to_end = to + n; | |
191 | char* real_to_end = buf + sizeof(buf); | |
192 | ||
193 | while(from_next < real_from_end) | |
194 | { | |
195 | const wchar_t* from = from_next; | |
196 | const wchar_t* from_end = from + m; | |
197 | if(from_end > real_from_end) | |
198 | from_end = real_from_end; | |
199 | if(to_end == to) | |
200 | { | |
201 | to_end = to + n; | |
202 | } | |
203 | ||
204 | std::codecvt_base::result r = cvt.out(mb, from, from_end, from_next, to, to_end, to_next); | |
205 | if(r == cvt_type::partial) | |
206 | { | |
207 | // If those are equal, then "partial" probably means: Need more input | |
208 | // Otherwise "Need more output" | |
209 | if(from_next != from_end) | |
210 | { | |
211 | TEST(to_end - to_next < cvt.max_length()); | |
212 | to_end += n; | |
1e59de90 | 213 | TEST(to_end <= real_to_end); // Should always be big enough |
f67539c2 TL |
214 | } |
215 | } else | |
216 | { | |
1e59de90 | 217 | TEST_EQ(r, cvt_type::ok); |
f67539c2 TL |
218 | } |
219 | ||
220 | while(to != to_next) | |
221 | { | |
1e59de90 | 222 | TEST_EQ(*nptr, *to); |
f67539c2 TL |
223 | nptr++; |
224 | to++; | |
225 | } | |
226 | from = from_next; | |
227 | } | |
228 | TEST(nptr == utf8_name + u8len); | |
229 | TEST(from_next == real_from_end); | |
1e59de90 TL |
230 | const auto expected = (sizeof(wchar_t) == 2) ? cvt_type::ok : cvt_type::noconv; // UTF-32 is not state-dependent |
231 | TEST_EQ(cvt.unshift(mb, to, to + n, to_next), expected); | |
f67539c2 TL |
232 | TEST(to_next == to); |
233 | } | |
234 | ||
235 | void test_codecvt_conv() | |
236 | { | |
237 | std::cout << "Conversions " << std::endl; | |
238 | std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>()); | |
239 | ||
240 | const cvt_type& cvt = std::use_facet<cvt_type>(l); | |
241 | const size_t utf8_len = std::strlen(utf8_name); | |
242 | const size_t wide_len = std::wcslen(wide_name); | |
243 | ||
244 | for(size_t i = 1; i <= utf8_len + 1; i++) | |
245 | { | |
246 | for(size_t j = 1; j <= wide_len + 1; j++) | |
247 | { | |
248 | try | |
249 | { | |
250 | test_codecvt_in_n_m(cvt, i, j); | |
251 | test_codecvt_out_n_m(cvt, i, j); | |
1e59de90 | 252 | } catch(...) // LCOV_EXCL_LINE |
f67539c2 | 253 | { |
1e59de90 TL |
254 | std::cerr << "Wlen=" << j << " Nlen=" << i << std::endl; // LCOV_EXCL_LINE |
255 | throw; // LCOV_EXCL_LINE | |
f67539c2 TL |
256 | } |
257 | } | |
258 | } | |
259 | } | |
260 | ||
261 | void test_codecvt_err() | |
262 | { | |
263 | std::cout << "Errors " << std::endl; | |
264 | std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>()); | |
265 | ||
266 | const cvt_type& cvt = std::use_facet<cvt_type>(l); | |
267 | ||
268 | std::cout << "- UTF-8" << std::endl; | |
269 | { | |
270 | { | |
271 | wchar_t buf[4]; | |
272 | wchar_t* const to = buf; | |
273 | wchar_t* const to_end = buf + 4; | |
274 | const char* err_utf = "1\xFF\xFF\xd7\xa9"; | |
20effc67 | 275 | std::mbstate_t mb{}; |
f67539c2 TL |
276 | const char* from = err_utf; |
277 | const char* from_end = from + std::strlen(from); | |
278 | const char* from_next = from; | |
279 | wchar_t* to_next = to; | |
1e59de90 | 280 | TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::ok); |
f67539c2 TL |
281 | TEST(from_next == from + 5); |
282 | TEST(to_next == to + 4); | |
283 | TEST(std::wstring(to, to_end) == boost::nowide::widen(err_utf)); | |
284 | } | |
285 | { | |
286 | wchar_t buf[4]; | |
287 | wchar_t* const to = buf; | |
288 | wchar_t* const to_end = buf + 4; | |
289 | const char* err_utf = "1\xd7"; // 1 valid, 1 incomplete UTF-8 char | |
20effc67 | 290 | std::mbstate_t mb{}; |
f67539c2 TL |
291 | const char* from = err_utf; |
292 | const char* from_end = from + std::strlen(from); | |
293 | const char* from_next = from; | |
294 | wchar_t* to_next = to; | |
1e59de90 | 295 | TEST_EQ(cvt.in(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::partial); |
f67539c2 TL |
296 | TEST(from_next == from + 1); |
297 | TEST(to_next == to + 1); | |
298 | TEST(std::wstring(to, to_next) == std::wstring(L"1")); | |
299 | } | |
300 | { | |
301 | char buf[4] = {}; | |
302 | char* const to = buf; | |
303 | char* const to_end = buf + 4; | |
304 | char* to_next = to; | |
305 | const wchar_t* err_utf = L"\xD800"; // Trailing UTF-16 surrogate | |
20effc67 | 306 | std::mbstate_t mb{}; |
f67539c2 TL |
307 | const wchar_t* from = err_utf; |
308 | const wchar_t* from_end = from + 1; | |
309 | const wchar_t* from_next = from; | |
310 | cvt_type::result res = cvt.out(mb, from, from_end, from_next, to, to_end, to_next); | |
311 | #ifdef BOOST_MSVC | |
312 | #pragma warning(disable : 4127) // Constant expression detected | |
313 | #endif | |
314 | if(sizeof(wchar_t) == 2) | |
315 | { | |
1e59de90 | 316 | TEST_EQ(res, cvt_type::partial); |
f67539c2 TL |
317 | TEST(from_next == from_end); |
318 | TEST(to_next == to); | |
1e59de90 | 319 | TEST_EQ(buf[0], 0); |
f67539c2 TL |
320 | } else |
321 | { | |
1e59de90 | 322 | TEST_EQ(res, cvt_type::ok); |
f67539c2 TL |
323 | TEST(from_next == from_end); |
324 | TEST(to_next == to + 3); | |
325 | // surrogate is invalid | |
1e59de90 | 326 | TEST_EQ(std::string(to, to_next), boost::nowide::narrow(wreplacement_str)); |
f67539c2 TL |
327 | } |
328 | } | |
329 | } | |
330 | ||
331 | std::cout << "- UTF-16/32" << std::endl; | |
332 | { | |
333 | char buf[32]; | |
334 | char* to = buf; | |
335 | char* to_end = buf + 32; | |
336 | char* to_next = to; | |
337 | wchar_t err_buf[3] = {'1', 0xDC9E, 0}; // second surrogate not works both for UTF-16 and 32 | |
338 | const wchar_t* err_utf = err_buf; | |
339 | { | |
20effc67 | 340 | std::mbstate_t mb{}; |
f67539c2 TL |
341 | const wchar_t* from = err_utf; |
342 | const wchar_t* from_end = from + std::wcslen(from); | |
343 | const wchar_t* from_next = from; | |
1e59de90 | 344 | TEST_EQ(cvt.out(mb, from, from_end, from_next, to, to_end, to_next), cvt_type::ok); |
f67539c2 TL |
345 | TEST(from_next == from + 2); |
346 | TEST(to_next == to + 4); | |
1e59de90 | 347 | TEST_EQ(std::string(to, to_next), "1" + boost::nowide::narrow(wreplacement_str)); |
f67539c2 TL |
348 | } |
349 | } | |
350 | } | |
351 | ||
352 | std::wstring codecvt_to_wide(const std::string& s) | |
353 | { | |
354 | std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>()); | |
355 | ||
356 | const cvt_type& cvt = std::use_facet<cvt_type>(l); | |
357 | ||
20effc67 | 358 | std::mbstate_t mb{}; |
f67539c2 TL |
359 | const char* const from = s.c_str(); |
360 | const char* const from_end = from + s.size(); | |
361 | const char* from_next = from; | |
362 | ||
363 | std::vector<wchar_t> buf(s.size() + 2); // +1 for possible incomplete char, +1 for NULL | |
364 | wchar_t* const to = &buf[0]; | |
365 | wchar_t* const to_end = to + buf.size(); | |
366 | wchar_t* to_next = to; | |
367 | ||
1e59de90 | 368 | const auto expected_consumed = cvt.length(mb, from, from_end, buf.size()); |
f67539c2 | 369 | cvt_type::result res = cvt.in(mb, from, from_end, from_next, to, to_end, to_next); |
1e59de90 | 370 | TEST_EQ(expected_consumed, from_next - from); |
f67539c2 TL |
371 | if(res == cvt_type::partial) |
372 | { | |
373 | TEST(to_next < to_end); | |
374 | *(to_next++) = BOOST_NOWIDE_REPLACEMENT_CHARACTER; | |
375 | } else | |
1e59de90 | 376 | TEST_EQ(res, cvt_type::ok); |
f67539c2 TL |
377 | |
378 | return std::wstring(to, to_next); | |
379 | } | |
380 | ||
381 | std::string codecvt_to_narrow(const std::wstring& s) | |
382 | { | |
383 | std::locale l(std::locale::classic(), new boost::nowide::utf8_codecvt<wchar_t>()); | |
384 | ||
385 | const cvt_type& cvt = std::use_facet<cvt_type>(l); | |
386 | ||
20effc67 | 387 | std::mbstate_t mb{}; |
f67539c2 TL |
388 | const wchar_t* const from = s.c_str(); |
389 | const wchar_t* const from_end = from + s.size(); | |
390 | const wchar_t* from_next = from; | |
391 | ||
392 | std::vector<char> buf((s.size() + 1) * 4 + 1); // +1 for possible incomplete char, +1 for NULL | |
393 | char* const to = &buf[0]; | |
394 | char* const to_end = to + buf.size(); | |
395 | char* to_next = to; | |
396 | ||
397 | cvt_type::result res = cvt.out(mb, from, from_end, from_next, to, to_end, to_next); | |
398 | if(res == cvt_type::partial) | |
399 | { | |
400 | TEST(to_next < to_end); | |
401 | return std::string(to, to_next) + boost::nowide::narrow(wreplacement_str); | |
402 | } else | |
1e59de90 | 403 | TEST_EQ(res, cvt_type::ok); |
f67539c2 TL |
404 | |
405 | return std::string(to, to_next); | |
406 | } | |
407 | ||
408 | void test_codecvt_subst() | |
409 | { | |
410 | std::cout << "Substitutions " << std::endl; | |
411 | run_all(codecvt_to_wide, codecvt_to_narrow); | |
412 | } | |
413 | ||
1e59de90 | 414 | // coverity [root_function] |
f67539c2 TL |
415 | void test_main(int, char**, char**) |
416 | { | |
1e59de90 TL |
417 | test_codecvt_basic(); |
418 | test_codecvt_unshift(); | |
f67539c2 TL |
419 | test_codecvt_conv(); |
420 | test_codecvt_err(); | |
421 | test_codecvt_subst(); | |
422 | } |