]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See | |
5 | // accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt) | |
7 | // | |
8 | #ifndef BOOST_LOCALE_UTIL_HPP | |
9 | #define BOOST_LOCALE_UTIL_HPP | |
10 | #include <locale> | |
11 | #include <typeinfo> | |
12 | #include <boost/cstdint.hpp> | |
13 | #include <boost/locale/utf.hpp> | |
14 | #include <boost/locale/generator.hpp> | |
15 | #include <boost/assert.hpp> | |
16 | ||
17 | #include <vector> | |
18 | namespace boost { | |
19 | namespace locale { | |
20 | /// | |
21 | /// \brief This namespace provides various utility functions useful for Boost.Locale backend | |
22 | /// implementations | |
23 | /// | |
24 | namespace util { | |
25 | ||
26 | /// | |
27 | /// \brief Return default system locale name in POSIX format. | |
28 | /// | |
29 | /// This function tries to detect the locale using the LC_CTYPE, LC_ALL and LANG environment | |
30 | /// variables, in this order; if all of them are unset, on POSIX platforms it returns "C". | |
31 | /// | |
32 | /// On Windows, in addition to checking the above environment variables, this function | |
33 | /// tries to create the locale name from the ISO-639 language and ISO-3166 country codes defined | |
34 | /// for the user default locale. | |
35 | /// If \a use_utf8_on_windows is true it sets the encoding to UTF-8; otherwise, if the system | |
36 | /// locale supports an ANSI code-page, it uses that ANSI encoding (like windows-1252), and it falls back | |
37 | /// to UTF-8 encoding if an ANSI code-page is not available. | |
38 | /// | |
39 | BOOST_LOCALE_DECL | |
40 | std::string get_system_locale(bool use_utf8_on_windows = false); | |
41 | ||
42 | /// | |
43 | /// \brief Installs the information facet into locale \a in, based on locale name \a name | |
44 | /// | |
45 | /// This function installs the boost::locale::info facet into the locale \a in and returns | |
46 | /// the newly created locale. | |
47 | /// | |
48 | /// Note: all information is based only on parsing of the string \a name. | |
49 | /// | |
50 | /// The name has the following format: language[_COUNTRY][.encoding][\@variant] | |
51 | /// where language is an ISO-639 language code like "en" or "ru", COUNTRY is an ISO-3166 | |
52 | /// country identifier like "US" or "RU", encoding is a character set name | |
53 | /// like UTF-8 or ISO-8859-1, and variant is a backend specific variant like \c euro or | |
54 | /// calendar=hebrew. | |
55 | /// | |
56 | /// If some parameters are missing they are specified as blanks; the default encoding | |
57 | /// is assumed to be US-ASCII and a missing language is assumed to be "C". | |
58 | /// | |
59 | BOOST_LOCALE_DECL | |
60 | std::locale create_info(std::locale const &in,std::string const &name); | |
61 | ||
62 | ||
63 | /// | |
64 | /// \brief This class represents a simple stateless converter from UCS-4 and to UCS-4 for | |
65 | /// each single code point | |
66 | /// | |
67 | /// This class is used for creation of a std::codecvt facet for converting UTF-16/UTF-32 encoding | |
68 | /// to the encoding supported by this converter | |
69 | /// | |
70 | /// Please note, this converter should be fully stateless. Fully stateless means it should | |
71 | /// never assume that it is called in any specific order on the text. Even if the | |
72 | /// encoding itself seems to be stateless like windows-1255 or shift-jis, some | |
73 | /// encoders (most notably iconv) can actually compose several code-points into one or | |
74 | /// decompose them in case composite characters are found. So be very careful when implementing | |
75 | /// these converters for a certain character set. | |
76 | /// | |
77 | class base_converter { | |
78 | public: | |
79 | ||
80 | /// | |
81 | /// This value should be returned when an illegal input sequence or code-point is observed: | |
82 | /// for example if a UCS-4 code-point is in the range reserved for UTF-16 surrogates | |
83 | /// or an invalid UTF-8 sequence is found | |
84 | /// | |
85 | static const uint32_t illegal=utf::illegal; | |
86 | ||
87 | /// | |
88 | /// This value is returned in the following cases: an incomplete input sequence was found or | |
89 | /// an insufficient output buffer was provided so the complete output could not be written. | |
90 | /// | |
91 | static const uint32_t incomplete=utf::incomplete; | |
92 | ||
93 | virtual ~base_converter() | |
94 | { | |
95 | } | |
96 | /// | |
97 | /// Return the maximal length that one Unicode code-point can be converted to, for example | |
98 | /// for UTF-8 it is 4, for Shift-JIS it is 2 and for ISO-8859-1 it is 1. The default implementation returns 1. | |
99 | /// | |
100 | virtual int max_len() const | |
101 | { | |
102 | return 1; | |
103 | } | |
104 | /// | |
105 | /// Returns true if calling the functions from_unicode, to_unicode, and max_len is thread safe. | |
106 | /// | |
107 | /// Rule of thumb: if this class' implementation uses simple tables that are unchanged | |
108 | /// or is purely algorithmic like UTF-8 - so it does not share any mutable bit for | |
109 | /// independent to_unicode, from_unicode calls, you may set it to true, otherwise, | |
110 | /// for example if you use iconv_t descriptor or UConverter as conversion object, return false, | |
111 | /// and this object will be cloned for each use. The default implementation returns false. | |
112 | /// | |
113 | virtual bool is_thread_safe() const | |
114 | { | |
115 | return false; | |
116 | } | |
117 | /// | |
118 | /// Create a polymorphic copy of this object, usually called only if is_thread_safe() returns false. The default implementation asserts that the dynamic type is exactly base_converter and returns a new base_converter. | |
119 | /// | |
120 | virtual base_converter *clone() const | |
121 | { | |
122 | BOOST_ASSERT(typeid(*this)==typeid(base_converter)); | |
123 | return new base_converter(); | |
124 | } | |
125 | ||
126 | /// | |
127 | /// Convert a single character starting at \a begin and ending at most at \a end to a Unicode code-point. | |
128 | /// | |
129 | /// If a valid input sequence is found in [\a begin,\a code_point_end) such that \a begin < \a code_point_end && \a code_point_end <= \a end, | |
130 | /// it is converted to its Unicode code point equivalent and \a begin is set to \a code_point_end. | |
131 | /// | |
132 | /// If an incomplete input sequence is found in [\a begin,\a end), i.e. there may be such \a code_point_end that \a code_point_end > \a end | |
133 | /// and [\a begin, \a code_point_end) would be a valid input sequence, then \a incomplete is returned and \a begin stays unchanged. For example, | |
134 | /// for UTF-8 conversion *begin = 0xc2 with \a begin + 1 = \a end is such a situation. | |
135 | /// | |
136 | /// If an invalid input sequence is found, i.e. there is a sequence [\a begin, \a code_point_end) such that \a code_point_end <= \a end | |
137 | /// that is illegal for this encoding, \a illegal is returned and \a begin stays unchanged. For example if *begin = 0xFF and begin < end | |
138 | /// for UTF-8, then \a illegal is returned. | |
139 | /// | |
140 | /// The default implementation accepts only single bytes <= 0x7F; an empty range yields \a incomplete and any other byte yields \a illegal. | |
141 | virtual uint32_t to_unicode(char const *&begin,char const *end) | |
142 | { | |
143 | if(begin == end) | |
144 | return incomplete; | |
145 | unsigned char cp = *begin; | |
146 | if(cp <= 0x7F) { | |
147 | begin++; | |
148 | return cp; | |
149 | } | |
150 | return illegal; | |
151 | } | |
152 | /// | |
153 | /// Convert a single code-point \a u into the encoding and store it in the [begin,end) range. | |
154 | /// | |
155 | /// If u is an invalid Unicode code-point, or it can not be mapped correctly to the represented character set, | |
156 | /// \a illegal should be returned. The default implementation maps only code-points below 0x80, each to a single byte. | |
157 | /// | |
158 | /// If u can be converted to a sequence of bytes c1, ... , cN (1<= N <= max_len() ) then | |
159 | /// | |
160 | /// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned | |
161 | /// -# If end - begin < N, incomplete is returned, it is unspecified what would be | |
162 | /// stored in bytes in range [begin,end) | |
163 | ||
164 | virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end) | |
165 | { | |
166 | if(begin==end) | |
167 | return incomplete; | |
168 | if(u >= 0x80) | |
169 | return illegal; | |
170 | *begin = static_cast<char>(u); | |
171 | return 1; | |
172 | } | |
173 | }; | |
174 | ||
11fdf7f2 | 175 | #if !defined(BOOST_LOCALE_HIDE_AUTO_PTR) && !defined(BOOST_NO_AUTO_PTR) |
7c673cae FG |
176 | /// |
177 | /// This function creates a \a base_converter that can be used for conversion between UTF-8 and | |
178 | /// Unicode code points | |
179 | /// | |
180 | BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_utf8_converter(); | |
181 | /// | |
182 | /// This function creates a \a base_converter that can be used for conversion between single byte | |
183 | /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points. | |
184 | /// | |
185 | /// If \a encoding is not supported, an empty pointer is returned. You should check that | |
186 | /// std::auto_ptr<base_converter>::get() != 0 | |
187 | /// | |
188 | BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding); | |
189 | ||
190 | ||
191 | /// | |
192 | /// Install a codecvt facet into locale \a in and return a new locale that is based on \a in and uses the new | |
193 | /// facet. | |
194 | /// | |
195 | /// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the \a cvt converter. | |
196 | /// If \a cvt is a null pointer, an always-failing conversion is used: it fails on the very first input or output. | |
197 | /// | |
198 | /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows how to break and join | |
199 | /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware | |
200 | /// of the wide encoding type | |
201 | /// | |
202 | BOOST_LOCALE_DECL | |
203 | std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type); | |
11fdf7f2 TL |
204 | #endif |
205 | ||
206 | #ifndef BOOST_NO_CXX11_SMART_PTR | |
207 | /// | |
208 | /// This function creates a \a base_converter that can be used for conversion between UTF-8 and | |
209 | /// Unicode code points | |
210 | /// | |
211 | BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_utf8_converter_unique_ptr(); | |
212 | /// | |
213 | /// This function creates a \a base_converter that can be used for conversion between single byte | |
214 | /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points. | |
215 | /// | |
216 | /// If \a encoding is not supported, an empty pointer is returned. You should check that | |
217 | /// std::unique_ptr<base_converter>::get() != 0 | |
218 | /// | |
219 | BOOST_LOCALE_DECL std::unique_ptr<base_converter> create_simple_converter_unique_ptr(std::string const &encoding); | |
220 | ||
221 | /// | |
222 | /// Install a codecvt facet into locale \a in and return a new locale that is based on \a in and uses the new | |
223 | /// facet. | |
224 | /// | |
225 | /// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the \a cvt converter. | |
226 | /// If \a cvt is a null pointer, an always-failing conversion is used: it fails on the very first input or output. | |
227 | /// | |
228 | /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows how to break and join | |
229 | /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware | |
230 | /// of the wide encoding type | |
231 | /// | |
232 | BOOST_LOCALE_DECL | |
233 | std::locale create_codecvt(std::locale const &in,std::unique_ptr<base_converter> cvt,character_facet_type type); | |
234 | #endif | |
235 | ||
236 | /// | |
237 | /// This function creates a \a base_converter that can be used for conversion between UTF-8 and | |
238 | /// Unicode code points; the converter is returned as a raw pointer | |
239 | /// | |
240 | BOOST_LOCALE_DECL base_converter *create_utf8_converter_new_ptr(); | |
241 | /// | |
242 | /// This function creates a \a base_converter that can be used for conversion between single byte | |
243 | /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points. | |
244 | /// | |
245 | /// If \a encoding is not supported, a null pointer is returned. You should check that | |
246 | /// the returned raw pointer is not 0 before using it | |
247 | /// | |
248 | BOOST_LOCALE_DECL base_converter *create_simple_converter_new_ptr(std::string const &encoding); | |
249 | ||
250 | /// | |
251 | /// Install a codecvt facet into locale \a in and return a new locale that is based on \a in and uses the new | |
252 | /// facet. | |
253 | /// | |
254 | /// The codecvt facet converts between narrow and wide/char16_t/char32_t encodings using the \a cvt converter. | |
255 | /// If \a cvt is a null pointer, an always-failing conversion is used: it fails on the very first input or output. | |
256 | /// | |
257 | /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows how to break and join | |
258 | /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware | |
259 | /// of the wide encoding type | |
260 | /// | |
261 | /// Ownership of \a cvt is transferred by this call | |
262 | /// | |
263 | BOOST_LOCALE_DECL | |
264 | std::locale create_codecvt_from_pointer(std::locale const &in,base_converter *cvt,character_facet_type type); | |
7c673cae FG |
265 | |
266 | /// | |
267 | /// Install a UTF-8 codecvt facet (converting to UTF-16 or UTF-32) into locale \a in and return | |
268 | /// a new locale that is based on \a in and uses the new facet. | |
269 | /// | |
270 | BOOST_LOCALE_DECL | |
271 | std::locale create_utf8_codecvt(std::locale const &in,character_facet_type type); | |
272 | ||
273 | /// | |
274 | /// This function installs a codecvt facet that can be used for conversion between single byte | |
275 | /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points. | |
276 | /// | |
277 | /// Throws boost::locale::conv::invalid_charset_error if the character set is not supported or isn't a single byte character | |
278 | /// set | |
279 | BOOST_LOCALE_DECL | |
280 | std::locale create_simple_codecvt(std::locale const &in,std::string const &encoding,character_facet_type type); | |
281 | } // util | |
282 | } // locale | |
283 | } // boost | |
284 | ||
285 | #endif | |
286 | // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 |