]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 |
2 | // test_utf8_codecvt.cpp | |
3 | ||
4 | // (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com . | |
5 | // Use, modification and distribution is subject to the Boost Software | |
6 | // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at | |
7 | // http://www.boost.org/LICENSE_1_0.txt) | |
8 | ||
#include <algorithm> // std::copy, std::equal
#include <fstream>
#include <iostream>
#include <iterator>
#include <locale>
#include <string>
#include <vector>

#include <cstddef> // size_t
#include <cstdlib> // EXIT_SUCCESS
#include <cwchar>
19 | #include <boost/config.hpp> | |
20 | ||
21 | #define BOOST_UTF8_BEGIN_NAMESPACE namespace boost { namespace detail { | |
22 | #define BOOST_UTF8_END_NAMESPACE } } | |
23 | #include <boost/detail/utf8_codecvt_facet.hpp> | |
24 | #include <boost/detail/utf8_codecvt_facet.ipp> | |
25 | ||
// When Boost.Config reports BOOST_NO_STDC_NAMESPACE, pull the C library
// names this test spells with std:: qualification (std::size_t,
// std::wcslen, std::wint_t) into namespace std from the global namespace.
#if defined(BOOST_NO_STDC_NAMESPACE)
namespace std{
    using ::size_t;
    using ::wcslen;
#if !defined(UNDER_CE) && !defined(__PGIC__)
    // Previously spelled "w_int", which is not a C library name; the test
    // body uses std::wint_t, so that is the name that must be imported.
    using ::wint_t;
#endif
} // namespace std
#endif
35 | ||
36 | // Note: copied from boost/iostreams/char_traits.hpp | |
37 | // | |
38 | // Dinkumware that comes with QNX Momentics 6.3.0, 4.0.2, incorrectly defines | |
39 | // the EOF and WEOF macros to not std:: qualify the wint_t type (and so does | |
40 | // Sun C++ 5.8 + STLport 4). Fix by placing the def in this scope. | |
41 | // NOTE: Use BOOST_WORKAROUND? | |
42 | #if (defined(__QNX__) && defined(BOOST_DINKUMWARE_STDLIB)) \ | |
43 | || defined(__SUNPRO_CC) | |
44 | using ::std::wint_t; | |
45 | #endif | |
46 | ||
47 | #include <boost/core/lightweight_test.hpp> | |
48 | ||
// Reference data for the conversion tests: a UTF-8 byte sequence and the
// wide-character values it is expected to decode to (and be re-encoded
// from). The template argument is used as sizeof(wchar_t), so exactly one
// of the specialisations below is picked for the platform under test.
template<std::size_t WideCharBytes>
struct test_data
{
    static unsigned char utf8_encoding[];
    static wchar_t wchar_encoding[];
};

// ---- 2-byte wchar_t ---------------------------------------------------------
// Smallest and largest code points of the 1-, 2- and 3-byte UTF-8 forms,
// with the 3-byte maximum capped at 0x7FFF.

template<>
unsigned char test_data<2>::utf8_encoding[] = {
    0x01,                   // U+0001
    0x7F,                   // U+007F
    0xC2, 0x80,             // U+0080
    0xDF, 0xBF,             // U+07FF
    0xE0, 0xA0, 0x80,       // U+0800
    0xE7, 0xBF, 0xBF        // U+7FFF
};

template<>
wchar_t test_data<2>::wchar_encoding[] = {
    static_cast<wchar_t>(0x0001),
    static_cast<wchar_t>(0x007F),
    static_cast<wchar_t>(0x0080),
    static_cast<wchar_t>(0x07FF),
    static_cast<wchar_t>(0x0800),
    static_cast<wchar_t>(0x7FFF)
};

// ---- 4-byte wchar_t ---------------------------------------------------------
// Smallest and largest code points of the 1- to 4-byte UTF-8 forms, up to
// U+10FFFF.
//
// codecvt implementations for clang and gcc don't handle more than 21 bits
// and return eof accordingly, so the full 32-bit range is not tested. The
// entries that stay disabled for that reason are:
//
//     0xf7, 0xbf, 0xbf, 0xbf                    <->  0x001fffff
//     0xf8, 0x88, 0x80, 0x80, 0x80              <->  0x00200000
//     0xfb, 0xbf, 0xbf, 0xbf, 0xbf              <->  0x03ffffff
//     0xfc, 0x84, 0x80, 0x80, 0x80, 0x80        <->  0x04000000
//     0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf        <->  0x7fffffff

template<>
unsigned char test_data<4>::utf8_encoding[] = {
    0x01,                   // U+0001
    0x7F,                   // U+007F
    0xC2, 0x80,             // U+0080
    0xDF, 0xBF,             // U+07FF
    0xE0, 0xA0, 0x80,       // U+0800
    0xEF, 0xBF, 0xBF,       // U+FFFF
    0xF0, 0x90, 0x80, 0x80, // U+10000
    0xF4, 0x8F, 0xBF, 0xBF  // U+10FFFF
};

template<>
wchar_t test_data<4>::wchar_encoding[] = {
    static_cast<wchar_t>(0x00000001),
    static_cast<wchar_t>(0x0000007F),
    static_cast<wchar_t>(0x00000080),
    static_cast<wchar_t>(0x000007FF),
    static_cast<wchar_t>(0x00000800),
    static_cast<wchar_t>(0x0000FFFF),
    static_cast<wchar_t>(0x00010000),
    static_cast<wchar_t>(0x0010FFFF)
};
119 | ||
120 | int | |
121 | test_main(int /* argc */, char * /* argv */[]) { | |
122 | std::locale utf8_locale | |
123 | = std::locale( | |
124 | std::locale::classic(), | |
125 | new boost::detail::utf8_codecvt_facet | |
126 | ); | |
127 | ||
128 | typedef char utf8_t; | |
129 | // define test data compatible with the wchar_t implementation | |
130 | // as either ucs-2 or ucs-4 depending on the compiler/library. | |
131 | typedef test_data<sizeof(wchar_t)> td; | |
132 | ||
133 | // Send our test UTF-8 data to file | |
134 | { | |
135 | std::ofstream ofs; | |
136 | ofs.open("test.dat"); | |
137 | std::copy( | |
138 | td::utf8_encoding, | |
139 | td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char), | |
140 | std::ostream_iterator<utf8_t>(ofs) | |
141 | ); | |
142 | } | |
143 | ||
144 | // Read the test data back in, converting to UCS-4 on the way in | |
145 | std::vector<wchar_t> from_file; | |
146 | { | |
147 | std::wifstream ifs; | |
148 | ifs.imbue(utf8_locale); | |
149 | ifs.open("test.dat"); | |
150 | ||
151 | std::wint_t item = 0; | |
152 | // note can't use normal vector from iterator constructor because | |
153 | // dinkumware doesn't have it. | |
154 | for(;;){ | |
155 | item = ifs.get(); | |
156 | if(item == WEOF) | |
157 | break; | |
158 | //ifs >> item; | |
159 | //if(ifs.eof()) | |
160 | // break; | |
161 | from_file.push_back(item); | |
162 | } | |
163 | } | |
164 | ||
165 | BOOST_TEST(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding)); | |
166 | ||
167 | // Send the UCS4_data back out, converting to UTF-8 | |
168 | { | |
169 | std::wofstream ofs; | |
170 | ofs.imbue(utf8_locale); | |
171 | ofs.open("test2.dat"); | |
172 | std::copy( | |
173 | from_file.begin(), | |
174 | from_file.end(), | |
175 | std::ostream_iterator<wchar_t, wchar_t>(ofs) | |
176 | ); | |
177 | } | |
178 | ||
179 | // Make sure that both files are the same | |
180 | { | |
181 | typedef std::istream_iterator<utf8_t> is_iter; | |
182 | is_iter end_iter; | |
183 | ||
184 | std::ifstream ifs1("test.dat"); | |
185 | is_iter it1(ifs1); | |
186 | std::vector<utf8_t> data1; | |
187 | std::copy(it1, end_iter, std::back_inserter(data1)); | |
188 | ||
189 | std::ifstream ifs2("test2.dat"); | |
190 | is_iter it2(ifs2); | |
191 | std::vector<utf8_t> data2; | |
192 | std::copy(it2, end_iter, std::back_inserter(data2)); | |
193 | ||
194 | BOOST_TEST(data1 == data2); | |
195 | } | |
196 | ||
197 | // some libraries have trouble that only shows up with longer strings | |
198 | ||
199 | const wchar_t * test3_data = L"\ | |
200 | <?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\ | |
201 | <!DOCTYPE boost_serialization>\ | |
202 | <boost_serialization signature=\"serialization::archive\" version=\"3\">\ | |
203 | <a class_id=\"0\" tracking_level=\"0\">\ | |
204 | <b>1</b>\ | |
205 | <f>96953204</f>\ | |
206 | <g>177129195</g>\ | |
207 | <l>1</l>\ | |
208 | <m>5627</m>\ | |
209 | <n>23010</n>\ | |
210 | <o>7419</o>\ | |
211 | <p>16212</p>\ | |
212 | <q>4086</q>\ | |
213 | <r>2749</r>\ | |
214 | <c>-33</c>\ | |
215 | <s>124</s>\ | |
216 | <t>28</t>\ | |
217 | <u>32225</u>\ | |
218 | <v>17543</v>\ | |
219 | <w>0.84431422</w>\ | |
220 | <x>1.0170664757130923</x>\ | |
221 | <y>tjbx</y>\ | |
222 | <z>cuwjentqpkejp</z>\ | |
223 | </a>\ | |
224 | </boost_serialization>\ | |
225 | "; | |
226 | ||
227 | // Send the UCS4_data back out, converting to UTF-8 | |
228 | std::size_t l = std::wcslen(test3_data); | |
229 | { | |
230 | std::wofstream ofs; | |
231 | ofs.imbue(utf8_locale); | |
232 | ofs.open("test3.dat"); | |
233 | std::copy( | |
234 | test3_data, | |
235 | test3_data + l, | |
236 | std::ostream_iterator<wchar_t, wchar_t>(ofs) | |
237 | ); | |
238 | } | |
239 | ||
240 | // Make sure that both files are the same | |
241 | { | |
242 | std::wifstream ifs; | |
243 | ifs.imbue(utf8_locale); | |
244 | ifs.open("test3.dat"); | |
245 | ifs >> std::noskipws; | |
246 | BOOST_TEST( | |
247 | std::equal( | |
248 | test3_data, | |
249 | test3_data + l, | |
250 | std::istream_iterator<wchar_t, wchar_t>(ifs) | |
251 | ) | |
252 | ); | |
253 | } | |
254 | ||
255 | // Test length calculation | |
256 | { | |
257 | std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale); | |
258 | std::mbstate_t mbs = std::mbstate_t(); | |
259 | const int utf8_len = sizeof(td::utf8_encoding) / sizeof(*td::utf8_encoding); | |
260 | int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + utf8_len), ~static_cast< std::size_t >(0u)); | |
261 | BOOST_TEST_EQ(utf8_len, res); | |
262 | } | |
263 | ||
b32b8144 FG |
264 | // Test that length calculation detects character boundaries |
265 | { | |
266 | std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale); | |
267 | std::mbstate_t mbs = std::mbstate_t(); | |
268 | // The first 5 bytes of utf8_encoding contain 3 complete UTF-8 characters (taking 4 bytes in total) and 1 byte of an incomplete character. | |
269 | // This last byte should not be accounted by length(). | |
270 | const int input_len = 5; | |
271 | const int utf8_len = 4; | |
272 | int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + input_len), ~static_cast< std::size_t >(0u)); | |
273 | BOOST_TEST_EQ(utf8_len, res); | |
274 | } | |
275 | ||
7c673cae FG |
276 | return EXIT_SUCCESS; |
277 | } | |
278 | ||
279 | int | |
280 | main(int argc, char * argv[]){ | |
281 | ||
282 | int retval = 1; | |
283 | BOOST_TRY{ | |
284 | retval = test_main(argc, argv); | |
285 | } | |
286 | #ifndef BOOST_NO_EXCEPTION_STD_NAMESPACE | |
287 | BOOST_CATCH(const std::exception & e){ | |
288 | BOOST_ERROR(e.what()); | |
289 | } | |
290 | #endif | |
291 | BOOST_CATCH(...){ | |
292 | BOOST_ERROR("failed with uncaught exception:"); | |
293 | } | |
294 | BOOST_CATCH_END | |
295 | ||
296 | int error_count = boost::report_errors(); | |
297 | if(error_count > 0) | |
298 | retval = error_count; | |
299 | return retval; | |
300 | } | |
301 |