]>
Commit | Line | Data |
---|---|---|
1 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 | |
2 | // test_utf8_codecvt.cpp | |
3 | ||
4 | // (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com . | |
5 | // Use, modification and distribution is subject to the Boost Software | |
6 | // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at | |
7 | // http://www.boost.org/LICENSE_1_0.txt) | |
8 | ||
9 | #include <algorithm> // std::copy | |
10 | #include <fstream> | |
11 | #include <iostream> | |
12 | #include <iterator> | |
13 | #include <locale> | |
14 | #include <vector> | |
15 | #include <string> | |
16 | ||
17 | #include <cstddef> // size_t | |
18 | #include <cwchar> | |
19 | #include <boost/config.hpp> | |
20 | ||
21 | #include <boost/archive/detail/utf8_codecvt_facet.hpp> | |
22 | ||
// When BOOST_NO_STDC_NAMESPACE is defined, the C library names used below
// with a std:: qualifier (std::size_t, std::wcslen, std::wint_t) are pulled
// from the global namespace into namespace std so those uses still compile.
#if defined(BOOST_NO_STDC_NAMESPACE)
namespace std{
    using ::size_t;
    using ::wcslen;
// wint_t is not re-exported when UNDER_CE or __PGIC__ is defined.
#if !defined(UNDER_CE) && !defined(__PGIC__)
    // Fixed: this previously named the non-existent "::w_int".
    using ::wint_t;
#endif
} // namespace std
#endif
32 | ||
// Workaround copied from boost/iostreams/char_traits.hpp.
//
// The Dinkumware library shipped with QNX Momentics (6.3.0, 4.0.2) and
// Sun C++ 5.8 + STLport 4 define the EOF and WEOF macros using wint_t
// without a std:: qualifier. Making wint_t visible in this scope lets
// those macros expand correctly.
// NOTE: Use BOOST_WORKAROUND?
#if defined(__SUNPRO_CC) \
    || (defined(__QNX__) && defined(BOOST_DINKUMWARE_STDLIB))
using ::std::wint_t;
#endif
43 | ||
44 | #include "test_tools.hpp" | |
45 | ||
// Holder for a pair of reference encodings. The template argument is used
// by the test as sizeof(wchar_t), so each explicit specialization of the
// members supplies data suited to that wide-character width. The primary
// template only declares the arrays.
template<std::size_t WideCharBytes>
struct test_data {
    static unsigned char utf8_encoding[];   // reference UTF-8 byte sequence
    static wchar_t wchar_encoding[];        // the same characters as wchar_t values
};
52 | ||
53 | template<> | |
54 | unsigned char test_data<2>::utf8_encoding[] = { | |
55 | 0x01, | |
56 | 0x7f, | |
57 | 0xc2, 0x80, | |
58 | 0xdf, 0xbf, | |
59 | 0xe0, 0xa0, 0x80, | |
60 | 0xe7, 0xbf, 0xbf | |
61 | }; | |
62 | ||
63 | template<> | |
64 | wchar_t test_data<2>::wchar_encoding[] = { | |
65 | 0x0001, | |
66 | 0x007f, | |
67 | 0x0080, | |
68 | 0x07ff, | |
69 | 0x0800, | |
70 | 0x7fff | |
71 | }; | |
72 | ||
73 | template<> | |
74 | unsigned char test_data<4>::utf8_encoding[] = { | |
75 | 0x01, | |
76 | 0x7f, | |
77 | 0xc2, 0x80, | |
78 | 0xdf, 0xbf, | |
79 | 0xe0, 0xa0, 0x80, | |
80 | 0xef, 0xbf, 0xbf, | |
81 | 0xf0, 0x90, 0x80, 0x80, | |
82 | 0xf4, 0x8f, 0xbf, 0xbf, | |
83 | /* codecvt implementations for clang and gcc don't handle more than 21 bits and | |
84 | * return eof accordlingly. So don't test the whole 32 range | |
85 | */ | |
86 | /* | |
87 | 0xf7, 0xbf, 0xbf, 0xbf, | |
88 | 0xf8, 0x88, 0x80, 0x80, 0x80, | |
89 | 0xfb, 0xbf, 0xbf, 0xbf, 0xbf, | |
90 | 0xfc, 0x84, 0x80, 0x80, 0x80, 0x80, | |
91 | 0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf | |
92 | */ | |
93 | }; | |
94 | ||
95 | template<> | |
96 | wchar_t test_data<4>::wchar_encoding[] = { | |
97 | (wchar_t)0x00000001, | |
98 | (wchar_t)0x0000007f, | |
99 | (wchar_t)0x00000080, | |
100 | (wchar_t)0x000007ff, | |
101 | (wchar_t)0x00000800, | |
102 | (wchar_t)0x0000ffff, | |
103 | (wchar_t)0x00010000, | |
104 | (wchar_t)0x0010ffff, | |
105 | /* codecvt implementations for clang and gcc don't handle more than 21 bits and | |
106 | * return eof accordlingly. So don't test the whole 32 range | |
107 | */ | |
108 | /* | |
109 | (wchar_t)0x001fffff, | |
110 | (wchar_t)0x00200000, | |
111 | (wchar_t)0x03ffffff, | |
112 | (wchar_t)0x04000000, | |
113 | (wchar_t)0x7fffffff | |
114 | */ | |
115 | }; | |
116 | ||
117 | int | |
118 | test_main(int /* argc */, char * /* argv */[]) { | |
119 | std::locale utf8_locale | |
120 | = std::locale( | |
121 | std::locale::classic(), | |
122 | new boost::archive::detail::utf8_codecvt_facet | |
123 | ); | |
124 | ||
125 | typedef char utf8_t; | |
126 | // define test data compatible with the wchar_t implementation | |
127 | // as either ucs-2 or ucs-4 depending on the compiler/library. | |
128 | typedef test_data<sizeof(wchar_t)> td; | |
129 | ||
130 | // Send our test UTF-8 data to file | |
131 | { | |
132 | std::ofstream ofs; | |
133 | ofs.open("test.dat"); | |
134 | std::copy( | |
135 | td::utf8_encoding, | |
136 | td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char), | |
137 | std::ostream_iterator<utf8_t>(ofs) | |
138 | ); | |
139 | } | |
140 | ||
141 | // Read the test data back in, converting to UCS-4 on the way in | |
142 | std::vector<wchar_t> from_file; | |
143 | { | |
144 | std::wifstream ifs; | |
145 | ifs.imbue(utf8_locale); | |
146 | ifs.open("test.dat"); | |
147 | ||
148 | std::wint_t item = 0; | |
149 | // note can't use normal vector from iterator constructor because | |
150 | // dinkumware doesn't have it. | |
151 | for(;;){ | |
152 | item = ifs.get(); | |
153 | if(item == WEOF) | |
154 | break; | |
155 | //ifs >> item; | |
156 | //if(ifs.eof()) | |
157 | // break; | |
158 | from_file.push_back(item); | |
159 | } | |
160 | } | |
161 | ||
162 | BOOST_CHECK(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding)); | |
163 | ||
164 | // Send the UCS4_data back out, converting to UTF-8 | |
165 | { | |
166 | std::wofstream ofs; | |
167 | ofs.imbue(utf8_locale); | |
168 | ofs.open("test2.dat"); | |
169 | std::copy( | |
170 | from_file.begin(), | |
171 | from_file.end(), | |
172 | std::ostream_iterator<wchar_t, wchar_t>(ofs) | |
173 | ); | |
174 | } | |
175 | ||
176 | // Make sure that both files are the same | |
177 | { | |
178 | typedef std::istream_iterator<utf8_t> is_iter; | |
179 | is_iter end_iter; | |
180 | ||
181 | std::ifstream ifs1("test.dat"); | |
182 | is_iter it1(ifs1); | |
183 | std::vector<utf8_t> data1; | |
184 | std::copy(it1, end_iter, std::back_inserter(data1)); | |
185 | ||
186 | std::ifstream ifs2("test2.dat"); | |
187 | is_iter it2(ifs2); | |
188 | std::vector<utf8_t> data2; | |
189 | std::copy(it2, end_iter, std::back_inserter(data2)); | |
190 | ||
191 | BOOST_CHECK(data1 == data2); | |
192 | } | |
193 | ||
194 | // some libraries have trouble that only shows up with longer strings | |
195 | ||
196 | const wchar_t * test3_data = L"\ | |
197 | <?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\ | |
198 | <!DOCTYPE boost_serialization>\ | |
199 | <boost_serialization signature=\"serialization::archive\" version=\"3\">\ | |
200 | <a class_id=\"0\" tracking_level=\"0\">\ | |
201 | <b>1</b>\ | |
202 | <f>96953204</f>\ | |
203 | <g>177129195</g>\ | |
204 | <l>1</l>\ | |
205 | <m>5627</m>\ | |
206 | <n>23010</n>\ | |
207 | <o>7419</o>\ | |
208 | <p>16212</p>\ | |
209 | <q>4086</q>\ | |
210 | <r>2749</r>\ | |
211 | <c>-33</c>\ | |
212 | <s>124</s>\ | |
213 | <t>28</t>\ | |
214 | <u>32225</u>\ | |
215 | <v>17543</v>\ | |
216 | <w>0.84431422</w>\ | |
217 | <x>1.0170664757130923</x>\ | |
218 | <y>tjbx</y>\ | |
219 | <z>cuwjentqpkejp</z>\ | |
220 | </a>\ | |
221 | </boost_serialization>\ | |
222 | "; | |
223 | ||
224 | // Send the UCS4_data back out, converting to UTF-8 | |
225 | std::size_t l = std::wcslen(test3_data); | |
226 | { | |
227 | std::wofstream ofs; | |
228 | ofs.imbue(utf8_locale); | |
229 | ofs.open("test3.dat"); | |
230 | std::copy( | |
231 | test3_data, | |
232 | test3_data + l, | |
233 | std::ostream_iterator<wchar_t, wchar_t>(ofs) | |
234 | ); | |
235 | } | |
236 | ||
237 | // Make sure that both files are the same | |
238 | { | |
239 | std::wifstream ifs; | |
240 | ifs.imbue(utf8_locale); | |
241 | ifs.open("test3.dat"); | |
242 | ifs >> std::noskipws; | |
243 | BOOST_CHECK( | |
244 | std::equal( | |
245 | test3_data, | |
246 | test3_data + l, | |
247 | std::istream_iterator<wchar_t, wchar_t>(ifs) | |
248 | ) | |
249 | ); | |
250 | } | |
251 | return EXIT_SUCCESS; | |
252 | } |