]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) |
2 | // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). | |
3 | // Distributed under the Boost Software License, Version 1.0. (See accompany- | |
4 | // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
5 | ||
6 | #ifndef BOOST_UTF8_CODECVT_FACET_HPP | |
7 | #define BOOST_UTF8_CODECVT_FACET_HPP | |
8 | ||
9 | // MS compatible compilers support #pragma once | |
10 | #if defined(_MSC_VER) && (_MSC_VER >= 1020) | |
11 | # pragma once | |
12 | #endif | |
13 | ||
14 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 | |
15 | // utf8_codecvt_facet.hpp | |
16 | ||
1e59de90 | 17 | // This header defines class utf8_codecvt_facet, derived from |
7c673cae FG |
18 | // std::codecvt<wchar_t, char>, which can be used to convert utf8 data in |
19 | // files into wchar_t strings in the application. | |
20 | // | |
21 | // The header is NOT STANDALONE, and is not to be included by the USER. | |
22 | // There are at least two libraries which want to use this functionality, and | |
23 | // we want to avoid code duplication. It would be possible to create utf8 | |
24 | // library, but: | |
25 | // - this requires review process first | |
1e59de90 | 26 | // - in the case, when linking a library which uses utf8 |
7c673cae | 27 | // (say 'program_options'), user should also link to the utf8 library. |
1e59de90 TL |
28 | // This seems inconvenient, and asking a user to link to an unreviewed |
29 | // library is strange. | |
7c673cae FG |
30 | // Until the above points are fixed, a library which wants to use utf8 must: |
31 | // - include this header in one of its headers or sources | |
32 | // - include the corresponding boost/detail/utf8_codecvt_facet.ipp file in one | |
33 | // of its sources | |
34 | // - before including either file, the library must define | |
35 | // - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used | |
36 | // - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace | |
37 | // declaration. | |
38 | // - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable' | |
39 | // symbols. | |
40 | // | |
41 | // For example, program_options library might contain: | |
1e59de90 | 42 | // #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character> |
7c673cae FG |
43 | // namespace boost { namespace program_options { |
44 | // #define BOOST_UTF8_END_NAMESPACE }} | |
45 | // #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL | |
46 | // #include <boost/detail/utf8_codecvt_facet.ipp> | |
47 | // | |
48 | // Essentially, each library will have its own copy of utf8 code, in | |
1e59de90 | 49 | // different namespaces. |
7c673cae FG |
50 | |
51 | // Note:(Robert Ramey). I have made the following alterations in the original | |
52 | // code. | |
53 | // a) Rendered utf8_codecvt<wchar_t, char> with using templates | |
54 | // b) Move longer functions outside class definition to prevent inlining | |
55 | // and make code smaller | |
56 | // c) added on a derived class to permit translation to/from current | |
57 | // locale to utf8 | |
58 | ||
59 | // See http://www.boost.org for updates, documentation, and revision history. | |
60 | ||
61 | // archives stored as text - note these are templated on the basic | |
62 | // stream templates to accommodate wide (and other?) kind of characters | |
63 | // | |
64 | // note the fact that on libraries without wide characters, ostream is | |
65 | // not a specialization of basic_ostream which in fact is not defined | |
66 | // in such cases. So we can't use basic_ostream<OStream::char_type> but rather | |
67 | // use two template parameters | |
68 | // | |
69 | // utf8_codecvt_facet | |
1e59de90 | 70 | // This is an implementation of a std::codecvt facet for translating |
7c673cae FG |
71 | // from UTF-8 externally to UCS-4. Note that this is not tied to |
72 | // any specific types in order to allow customization on platforms | |
73 | // where wchar_t is not big enough. | |
74 | // | |
75 | // NOTES: The current implementation jumps through some unpleasant hoops in | |
76 | // order to deal with signed character types. As a std::codecvt_base::result, | |
77 | // it is necessary for the ExternType to be convertible to unsigned char. | |
78 | // I chose not to tie the extern_type explicitly to char. But if any combination | |
79 | // of types other than <wchar_t,char_t> is used, then std::codecvt must be | |
80 | // specialized on those types for this to work. | |
81 | ||
82 | #include <locale> | |
83 | #include <cwchar> // for mbstate_t | |
84 | #include <cstddef> // for std::size_t | |
85 | ||
86 | #include <boost/config.hpp> | |
87 | #include <boost/detail/workaround.hpp> | |
88 | ||
89 | #if defined(BOOST_NO_STDC_NAMESPACE) | |
90 | namespace std { | |
91 | using ::mbstate_t; | |
92 | using ::size_t; | |
93 | } | |
94 | #endif | |
95 | ||
96 | // maximum length of a multibyte string | |
97 | #define MB_LENGTH_MAX 8 | |
98 | ||
99 | BOOST_UTF8_BEGIN_NAMESPACE | |
100 | ||
101 | //----------------------------------------------------------------------------// | |
102 | // // | |
103 | // utf8_codecvt_facet // | |
104 | // // | |
105 | // See utf8_codecvt_facet.ipp for the implementation. // | |
106 | //----------------------------------------------------------------------------// | |
107 | ||
108 | #ifndef BOOST_UTF8_DECL | |
109 | #define BOOST_UTF8_DECL | |
110 | #endif | |
111 | ||
1e59de90 TL |
112 | struct BOOST_SYMBOL_VISIBLE utf8_codecvt_facet : |
113 | public std::codecvt<wchar_t, char, std::mbstate_t> | |
7c673cae FG |
114 | { |
115 | public: | |
1e59de90 TL |
116 | BOOST_UTF8_DECL explicit utf8_codecvt_facet(std::size_t no_locale_manage = 0); |
117 | BOOST_UTF8_DECL virtual ~utf8_codecvt_facet(); | |
118 | ||
7c673cae | 119 | protected: |
1e59de90 TL |
120 | BOOST_UTF8_DECL virtual std::codecvt_base::result do_in( |
121 | std::mbstate_t& state, | |
7c673cae | 122 | const char * from, |
1e59de90 | 123 | const char * from_end, |
7c673cae | 124 | const char * & from_next, |
1e59de90 TL |
125 | wchar_t * to, |
126 | wchar_t * to_end, | |
127 | wchar_t * & to_next | |
7c673cae FG |
128 | ) const; |
129 | ||
1e59de90 | 130 | BOOST_UTF8_DECL virtual std::codecvt_base::result do_out( |
7c673cae FG |
131 | std::mbstate_t & state, |
132 | const wchar_t * from, | |
133 | const wchar_t * from_end, | |
1e59de90 | 134 | const wchar_t * & from_next, |
7c673cae FG |
135 | char * to, |
136 | char * to_end, | |
137 | char * & to_next | |
138 | ) const; | |
139 | ||
140 | bool invalid_continuing_octet(unsigned char octet_1) const { | |
141 | return (octet_1 < 0x80|| 0xbf< octet_1); | |
142 | } | |
143 | ||
1e59de90 | 144 | bool invalid_leading_octet(unsigned char octet_1) const { |
7c673cae FG |
145 | return (0x7f < octet_1 && octet_1 < 0xc0) || |
146 | (octet_1 > 0xfd); | |
147 | } | |
148 | ||
149 | // Number of continuing octets for lead_octet: every octet of the sequence except the leading one, i.e. get_octet_count(lead_octet) - 1. | |
150 | static unsigned int get_cont_octet_count(unsigned char lead_octet) { | |
151 | return get_octet_count(lead_octet) - 1; | |
152 | } | |
153 | ||
1e59de90 | 154 | BOOST_UTF8_DECL static unsigned int get_octet_count(unsigned char lead_octet); |
7c673cae FG |
155 | |
156 | // How many "continuing octets" will be needed for this word | |
157 | // == total octets - 1. | |
1e59de90 | 158 | BOOST_UTF8_DECL static int get_cont_octet_out_count(wchar_t word); |
7c673cae FG |
159 | |
160 | virtual bool do_always_noconv() const BOOST_NOEXCEPT_OR_NOTHROW { | |
161 | return false; | |
162 | } | |
163 | ||
164 | // UTF-8 isn't really stateful since we rewind on partial conversions, so there is nothing to unshift: next is set to from and ok is returned. | |
165 | virtual std::codecvt_base::result do_unshift( | |
1e59de90 | 166 | std::mbstate_t &, |
7c673cae FG |
167 | char * from, |
168 | char * /*to*/, | |
169 | char * & next | |
170 | ) const { | |
171 | next = from; | |
172 | return ok; | |
173 | } | |
174 | ||
175 | virtual int do_encoding() const BOOST_NOEXCEPT_OR_NOTHROW { | |
176 | const int variable_byte_external_encoding=0; | |
177 | return variable_byte_external_encoding; | |
178 | } | |
179 | ||
180 | // How many char objects can I process to get <= max_limit | |
181 | // wchar_t objects? | |
1e59de90 | 182 | BOOST_UTF8_DECL virtual int do_length( |
b32b8144 | 183 | std::mbstate_t &, |
7c673cae | 184 | const char * from, |
1e59de90 | 185 | const char * from_end, |
7c673cae FG |
186 | std::size_t max_limit |
187 | ) const | |
188 | #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) | |
189 | throw() | |
190 | #endif | |
191 | ; | |
b32b8144 FG |
192 | |
193 | // Nonstandard override taking a const state: casts away the const and forwards to the do_length overload declared above. | |
7c673cae | 194 | virtual int do_length( |
b32b8144 | 195 | const std::mbstate_t & s, |
7c673cae | 196 | const char * from, |
1e59de90 | 197 | const char * from_end, |
7c673cae FG |
198 | std::size_t max_limit |
199 | ) const | |
200 | #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) | |
201 | throw() | |
202 | #endif | |
203 | { | |
204 | return do_length( | |
b32b8144 | 205 | const_cast<std::mbstate_t &>(s), |
7c673cae FG |
206 | from, |
207 | from_end, | |
208 | max_limit | |
209 | ); | |
210 | } | |
b32b8144 | 211 | |
7c673cae FG |
212 | // Largest possible value do_length(state,from,from_end,1) could return. |
213 | virtual int do_max_length() const BOOST_NOEXCEPT_OR_NOTHROW { | |
214 | return 6; // largest UTF-8 encoding of a UCS-4 character | |
215 | } | |
216 | }; | |
217 | ||
218 | BOOST_UTF8_END_NAMESPACE | |
219 | ||
220 | #endif // BOOST_UTF8_CODECVT_FACET_HPP |