]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See | |
5 | // accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt) | |
7 | // | |
8 | #define BOOST_LOCALE_SOURCE | |
9 | #include <boost/locale/generator.hpp> | |
10 | #include <boost/locale/encoding.hpp> | |
11 | #include <boost/locale/utf8_codecvt.hpp> | |
12 | ||
13 | #include "../encoding/conv.hpp" | |
14 | ||
15 | #include <boost/locale/util.hpp> | |
16 | ||
17 | #ifdef BOOST_MSVC | |
18 | # pragma warning(disable : 4244 4996) // loose data | |
19 | #endif | |
20 | ||
21 | #include <cstddef> | |
22 | #include <string.h> | |
23 | #include <vector> | |
24 | #include <algorithm> | |
25 | ||
26 | //#define DEBUG_CODECVT | |
27 | ||
28 | #ifdef DEBUG_CODECVT | |
29 | #include <iostream> | |
30 | #endif | |
31 | ||
32 | namespace boost { | |
33 | namespace locale { | |
34 | namespace util { | |
35 | ||
36 | class utf8_converter : public base_converter { | |
37 | public: | |
38 | virtual int max_len() const | |
39 | { | |
40 | return 4; | |
41 | } | |
42 | ||
43 | virtual utf8_converter *clone() const | |
44 | { | |
45 | return new utf8_converter(); | |
46 | } | |
47 | ||
48 | bool is_thread_safe() const | |
49 | { | |
50 | return true; | |
51 | } | |
52 | ||
53 | virtual uint32_t to_unicode(char const *&begin,char const *end) | |
54 | { | |
55 | char const *p=begin; | |
56 | ||
57 | utf::code_point c = utf::utf_traits<char>::decode(p,end); | |
58 | ||
59 | if(c==utf::illegal) | |
60 | return illegal; | |
61 | ||
62 | if(c==utf::incomplete) | |
63 | return incomplete; | |
64 | ||
65 | begin = p; | |
66 | return c; | |
67 | } | |
68 | ||
69 | virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end) | |
70 | { | |
71 | if(!utf::is_valid_codepoint(u)) | |
72 | return illegal; | |
73 | int width = utf::utf_traits<char>::width(u); | |
74 | std::ptrdiff_t d=end-begin; | |
75 | if(d < width) | |
76 | return incomplete; | |
77 | utf::utf_traits<char>::encode(u,begin); | |
78 | return width; | |
79 | } | |
80 | }; // utf8_converter | |
81 | ||
82 | class simple_converter_impl { | |
83 | public: | |
84 | ||
85 | static const int hash_table_size = 1024; | |
86 | ||
87 | simple_converter_impl(std::string const &encoding) | |
88 | { | |
89 | for(unsigned i=0;i<128;i++) | |
90 | to_unicode_tbl_[i]=i; | |
91 | for(unsigned i=128;i<256;i++) { | |
92 | char buf[2] = { char(i) , 0 }; | |
93 | uint32_t uchar=utf::illegal; | |
94 | try { | |
95 | std::wstring const tmp = conv::to_utf<wchar_t>(buf,buf+1,encoding,conv::stop); | |
96 | if(tmp.size() == 1) { | |
97 | uchar = tmp[0]; | |
98 | } | |
99 | else { | |
100 | uchar = utf::illegal; | |
101 | } | |
102 | } | |
103 | catch(conv::conversion_error const &/*e*/) { | |
104 | uchar = utf::illegal; | |
105 | } | |
106 | to_unicode_tbl_[i]=uchar; | |
107 | } | |
108 | for(int i=0;i<hash_table_size;i++) | |
109 | from_unicode_tbl_[i]=0; | |
110 | for(unsigned i=1;i<256;i++) { | |
111 | if(to_unicode_tbl_[i]!=utf::illegal) { | |
112 | unsigned pos = to_unicode_tbl_[i] % hash_table_size; | |
113 | while(from_unicode_tbl_[pos]!=0) | |
114 | pos = (pos + 1) % hash_table_size; | |
115 | from_unicode_tbl_[pos] = i; | |
116 | } | |
117 | } | |
118 | } | |
119 | ||
120 | uint32_t to_unicode(char const *&begin,char const *end) const | |
121 | { | |
122 | if(begin==end) | |
123 | return utf::incomplete; | |
124 | unsigned char c = *begin++; | |
125 | return to_unicode_tbl_[c]; | |
126 | } | |
127 | uint32_t from_unicode(uint32_t u,char *begin,char const *end) const | |
128 | { | |
129 | if(begin==end) | |
130 | return utf::incomplete; | |
131 | if(u==0) { | |
132 | *begin = 0; | |
133 | return 1; | |
134 | } | |
135 | unsigned pos = u % hash_table_size; | |
136 | unsigned char c; | |
137 | while((c=from_unicode_tbl_[pos])!=0 && to_unicode_tbl_[c]!=u) | |
138 | pos = (pos + 1) % hash_table_size; | |
139 | if(c==0) | |
140 | return utf::illegal; | |
141 | *begin = c; | |
142 | return 1; | |
143 | } | |
144 | private: | |
145 | uint32_t to_unicode_tbl_[256]; | |
146 | unsigned char from_unicode_tbl_[hash_table_size]; | |
147 | }; | |
148 | ||
149 | class simple_converter : public base_converter { | |
150 | public: | |
151 | ||
152 | virtual ~simple_converter() | |
153 | { | |
154 | } | |
155 | ||
156 | simple_converter(std::string const &encoding) : | |
157 | cvt_(encoding) | |
158 | { | |
159 | } | |
160 | ||
161 | virtual int max_len() const | |
162 | { | |
163 | return 1; | |
164 | } | |
165 | ||
166 | virtual bool is_thread_safe() const | |
167 | { | |
168 | return true; | |
169 | } | |
170 | virtual base_converter *clone() const | |
171 | { | |
172 | return new simple_converter(*this); | |
173 | } | |
174 | ||
175 | virtual uint32_t to_unicode(char const *&begin,char const *end) | |
176 | { | |
177 | return cvt_.to_unicode(begin,end); | |
178 | } | |
179 | virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end) | |
180 | { | |
181 | return cvt_.from_unicode(u,begin,end); | |
182 | } | |
183 | private: | |
184 | simple_converter_impl cvt_; | |
185 | }; | |
186 | ||
187 | template<typename CharType> | |
188 | class simple_codecvt : public generic_codecvt<CharType,simple_codecvt<CharType> > | |
189 | { | |
190 | public: | |
191 | ||
192 | simple_codecvt(std::string const &encoding,size_t refs = 0) : | |
193 | generic_codecvt<CharType,simple_codecvt<CharType> >(refs), | |
194 | cvt_(encoding) | |
195 | { | |
196 | } | |
197 | ||
198 | struct state_type {}; | |
199 | static state_type initial_state(generic_codecvt_base::initial_convertion_state /* unused */) | |
200 | { | |
201 | return state_type(); | |
202 | } | |
203 | static int max_encoding_length() | |
204 | { | |
205 | return 1; | |
206 | } | |
207 | ||
208 | utf::code_point to_unicode(state_type &,char const *&begin,char const *end) const | |
209 | { | |
210 | return cvt_.to_unicode(begin,end); | |
211 | } | |
212 | ||
213 | utf::code_point from_unicode(state_type &,utf::code_point u,char *begin,char const *end) const | |
214 | { | |
215 | return cvt_.from_unicode(u,begin,end); | |
216 | } | |
217 | private: | |
218 | simple_converter_impl cvt_; | |
219 | ||
220 | }; | |
221 | ||
namespace {
    // Names of the encodings handled by the simple (single byte) converter.
    // The array is looked up with std::binary_search using compare_strings,
    // so the entries have to stay in ascending strcmp() order.
    char const *simple_encoding_table[] = {
        "cp1250", "cp1251", "cp1252", "cp1253",
        "cp1254", "cp1255", "cp1256", "cp1257",
        "iso88591", "iso885913", "iso885915",
        "iso88592", "iso88593", "iso88594", "iso88595",
        "iso88596", "iso88597", "iso88598", "iso88599",
        "koi8r", "koi8u",
        "usascii",
        "windows1250", "windows1251", "windows1252", "windows1253",
        "windows1254", "windows1255", "windows1256", "windows1257"
    };

    // Strict "less than" ordering of two C strings based on strcmp()
    bool compare_strings(char const *l,char const *r)
    {
        int const order = strcmp(l,r);
        return order < 0;
    }
}
261 | ||
262 | bool check_is_simple_encoding(std::string const &encoding) | |
263 | { | |
264 | std::string norm = conv::impl::normalize_encoding(encoding.c_str()); | |
265 | return std::binary_search<char const **>( simple_encoding_table, | |
266 | simple_encoding_table + sizeof(simple_encoding_table)/sizeof(char const *), | |
267 | norm.c_str(), | |
268 | compare_strings); | |
269 | return 0; | |
270 | } | |
271 | ||
11fdf7f2 TL |
272 | #if !defined(BOOST_LOCALE_HIDE_AUTO_PTR) && !defined(BOOST_NO_AUTO_PTR) |
273 | std::auto_ptr<base_converter> create_utf8_converter() | |
274 | { | |
275 | std::auto_ptr<base_converter> res(create_utf8_converter_new_ptr()); | |
276 | return res; | |
277 | } | |
7c673cae FG |
278 | std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding) |
279 | { | |
11fdf7f2 | 280 | std::auto_ptr<base_converter> res(create_simple_converter_new_ptr(encoding)); |
7c673cae FG |
281 | return res; |
282 | } | |
11fdf7f2 TL |
283 | std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type) |
284 | { | |
285 | return create_codecvt_from_pointer(in,cvt.release(),type); | |
286 | } | |
287 | #endif | |
288 | #ifndef BOOST_NO_CXX11_SMART_PTR | |
289 | std::unique_ptr<base_converter> create_utf8_converter_unique_ptr() | |
290 | { | |
291 | std::unique_ptr<base_converter> res(create_utf8_converter_new_ptr()); | |
292 | return res; | |
293 | } | |
294 | std::unique_ptr<base_converter> create_simple_converter_unique_ptr(std::string const &encoding) | |
7c673cae | 295 | { |
11fdf7f2 | 296 | std::unique_ptr<base_converter> res(create_simple_converter_new_ptr(encoding)); |
7c673cae FG |
297 | return res; |
298 | } | |
11fdf7f2 TL |
299 | std::locale create_codecvt(std::locale const &in,std::unique_ptr<base_converter> cvt,character_facet_type type) |
300 | { | |
301 | return create_codecvt_from_pointer(in,cvt.release(),type); | |
302 | } | |
303 | #endif | |
304 | ||
305 | base_converter *create_simple_converter_new_ptr(std::string const &encoding) | |
306 | { | |
307 | if(check_is_simple_encoding(encoding)) | |
308 | return new simple_converter(encoding); | |
309 | return 0; | |
310 | } | |
311 | ||
312 | base_converter *create_utf8_converter_new_ptr() | |
313 | { | |
314 | return new utf8_converter(); | |
315 | } | |
7c673cae FG |
316 | |
317 | template<typename CharType> | |
318 | class code_converter : public generic_codecvt<CharType,code_converter<CharType> > | |
319 | { | |
320 | public: | |
11fdf7f2 TL |
321 | #ifndef BOOST_NO_CXX11_SMART_PTR |
322 | typedef std::unique_ptr<base_converter> base_converter_ptr; | |
323 | #define PTR_TRANS(x) std::move((x)) | |
324 | #else | |
325 | typedef std::auto_ptr<base_converter> base_converter_ptr; | |
326 | #define PTR_TRANS(x) (x) | |
327 | #endif | |
328 | typedef base_converter_ptr state_type; | |
329 | ||
330 | code_converter(base_converter_ptr cvt,size_t refs = 0) : | |
7c673cae | 331 | generic_codecvt<CharType,code_converter<CharType> >(refs), |
11fdf7f2 | 332 | cvt_(PTR_TRANS(cvt)) |
7c673cae FG |
333 | { |
334 | max_len_ = cvt_->max_len(); | |
335 | thread_safe_ = cvt_->is_thread_safe(); | |
336 | } | |
337 | ||
7c673cae FG |
338 | |
339 | int max_encoding_length() const | |
340 | { | |
341 | return max_len_; | |
342 | } | |
343 | ||
11fdf7f2 | 344 | base_converter_ptr initial_state(generic_codecvt_base::initial_convertion_state /* unused */) const |
7c673cae | 345 | { |
11fdf7f2 | 346 | base_converter_ptr r; |
7c673cae FG |
347 | if(!thread_safe_) |
348 | r.reset(cvt_->clone()); | |
349 | return r; | |
350 | } | |
351 | ||
11fdf7f2 | 352 | utf::code_point to_unicode(base_converter_ptr &ptr,char const *&begin,char const *end) const |
7c673cae FG |
353 | { |
354 | if(thread_safe_) | |
355 | return cvt_->to_unicode(begin,end); | |
356 | else | |
357 | return ptr->to_unicode(begin,end); | |
358 | } | |
359 | ||
11fdf7f2 | 360 | utf::code_point from_unicode(base_converter_ptr &ptr,utf::code_point u,char *begin,char const *end) const |
7c673cae FG |
361 | { |
362 | if(thread_safe_) | |
363 | return cvt_->from_unicode(u,begin,end); | |
364 | else | |
365 | return ptr->from_unicode(u,begin,end); | |
366 | } | |
367 | ||
368 | private: | |
11fdf7f2 | 369 | base_converter_ptr cvt_; |
7c673cae FG |
370 | int max_len_; |
371 | bool thread_safe_; | |
372 | }; | |
373 | ||
374 | ||
11fdf7f2 | 375 | std::locale create_codecvt_from_pointer(std::locale const &in,base_converter *pcvt,character_facet_type type) |
7c673cae | 376 | { |
11fdf7f2 | 377 | code_converter<char>::base_converter_ptr cvt(pcvt); |
7c673cae FG |
378 | if(!cvt.get()) |
379 | cvt.reset(new base_converter()); | |
380 | switch(type) { | |
381 | case char_facet: | |
11fdf7f2 | 382 | return std::locale(in,new code_converter<char>(PTR_TRANS(cvt))); |
7c673cae | 383 | case wchar_t_facet: |
11fdf7f2 | 384 | return std::locale(in,new code_converter<wchar_t>(PTR_TRANS(cvt))); |
7c673cae FG |
385 | #if defined(BOOST_LOCALE_ENABLE_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT) |
386 | case char16_t_facet: | |
11fdf7f2 | 387 | return std::locale(in,new code_converter<char16_t>(PTR_TRANS(cvt))); |
7c673cae FG |
388 | #endif |
389 | #if defined(BOOST_LOCALE_ENABLE_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT) | |
390 | case char32_t_facet: | |
11fdf7f2 | 391 | return std::locale(in,new code_converter<char32_t>(PTR_TRANS(cvt))); |
7c673cae FG |
392 | #endif |
393 | default: | |
394 | return in; | |
395 | } | |
396 | } | |
397 | ||
398 | ||
399 | /// | |
400 | /// Install utf8 codecvt to UTF-16 or UTF-32 into locale \a in and return | |
401 | /// new locale that is based on \a in and uses new facet. | |
402 | /// | |
403 | std::locale create_utf8_codecvt(std::locale const &in,character_facet_type type) | |
404 | { | |
405 | switch(type) { | |
406 | case char_facet: | |
407 | return std::locale(in,new utf8_codecvt<char>()); | |
408 | case wchar_t_facet: | |
409 | return std::locale(in,new utf8_codecvt<wchar_t>()); | |
410 | #if defined(BOOST_LOCALE_ENABLE_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT) | |
411 | case char16_t_facet: | |
412 | return std::locale(in,new utf8_codecvt<char16_t>()); | |
413 | #endif | |
414 | #if defined(BOOST_LOCALE_ENABLE_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT) | |
415 | case char32_t_facet: | |
416 | return std::locale(in,new utf8_codecvt<char32_t>()); | |
417 | #endif | |
418 | default: | |
419 | return in; | |
420 | } | |
421 | } | |
422 | ||
423 | /// | |
424 | /// This function installs codecvt that can be used for conversion between single byte | |
425 | /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points, | |
426 | /// | |
427 | /// Throws invalid_charset_error if the chacater set is not supported or isn't single byte character | |
428 | /// set | |
429 | std::locale create_simple_codecvt(std::locale const &in,std::string const &encoding,character_facet_type type) | |
430 | { | |
431 | if(!check_is_simple_encoding(encoding)) | |
432 | throw boost::locale::conv::invalid_charset_error("Invalid simple encoding " + encoding); | |
433 | ||
434 | switch(type) { | |
435 | case char_facet: | |
436 | return std::locale(in,new simple_codecvt<char>(encoding)); | |
437 | case wchar_t_facet: | |
438 | return std::locale(in,new simple_codecvt<wchar_t>(encoding)); | |
439 | #if defined(BOOST_LOCALE_ENABLE_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT) | |
440 | case char16_t_facet: | |
441 | return std::locale(in,new simple_codecvt<char16_t>(encoding)); | |
442 | #endif | |
443 | #if defined(BOOST_LOCALE_ENABLE_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT) | |
444 | case char32_t_facet: | |
445 | return std::locale(in,new simple_codecvt<char32_t>(encoding)); | |
446 | #endif | |
447 | default: | |
448 | return in; | |
449 | } | |
450 | } | |
451 | ||
452 | ||
453 | ||
454 | } // util | |
455 | } // locale | |
456 | } // boost | |
457 | ||
458 | // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 |