]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See | |
5 | // accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt) | |
7 | // | |
8 | ||
9 | // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 filetype=cpp.doxygen | |
10 | /*! | |
11 | \page charset_handling Character Set Conversions | |
12 | ||
13 | \section codecvt Convenience Interface | |
14 | ||
15 | Boost.Locale provides \ref boost::locale::conv::to_utf() "to_utf", \ref boost::locale::conv::from_utf() "from_utf" and | |
16 | \ref boost::locale::conv::utf_to_utf() "utf_to_utf" functions in | |
17 | the \c boost::locale::conv namespace. They are simple and | |
18 | convenient functions to convert a string to and from | |
19 | UTF-8/16/32 strings and strings using other encodings. | |
20 | ||
21 | For example: | |
22 | ||
23 | \code | |
24 | std::string utf8_string = to_utf<char>(latin1_string,"Latin1"); | |
25 | std::wstring wide_string = to_utf<wchar_t>(latin1_string,"Latin1"); | |
26 | std::string latin1_string = from_utf(wide_string,"Latin1"); | |
27 | std::string utf8_string2 = utf_to_utf<char>(wide_string); | |
28 | \endcode | |
29 | ||
30 | ||
31 | These functions may use an explicit encoding name like "Latin1" or "ISO-8859-8", | |
32 | or use std::locale as a parameter to fetch this information from it. | |
33 | They also receive a policy parameter that tells them how to behave if the | |
34 | conversion can't be performed (i.e. an illegal or unsupported character is found). | |
35 | By default these functions skip all illegal characters and try to do the best they | |
36 | can; however, it is possible to ask them to throw | |
37 | a \ref boost::locale::conv::conversion_error "conversion_error" exception | |
38 | by passing the \c stop flag to them: | |
39 | ||
40 | \code | |
41 | std::wstring s=to_utf<wchar_t>("\xFF\xFF","UTF-8",stop); | |
42 | // Throws because this string is illegal in UTF-8 | |
43 | \endcode | |
44 | ||
45 | \section codecvt_codecvt std::codecvt facet | |
46 | ||
47 | Boost.Locale provides stream codepage conversion facets based on the \c std::codecvt facet. | |
48 | This allows conversion between wide-character encodings and 8-bit encodings like UTF-8, ISO-8859 or Shift-JIS. | |
49 | ||
50 | Most compilers provide such facets, but: | |
51 | ||
52 | - Under Windows MSVC does not support UTF-8 encodings at all. | |
53 | - Under Linux the encodings are supported only if the required locales are generated. For example | |
54 | it may be impossible to create a \c he_IL.CP1255 locale even when the \c he_IL locale is available. | |
55 | ||
56 | Thus Boost.Locale provides an option to generate code-page conversion facets for use with | |
57 | Boost.Iostreams filters or \c std::wfstream. For example: | |
58 | ||
59 | \code | |
60 | std::locale loc= generator().generate("he_IL.UTF-8"); | |
61 | std::wofstream file; | |
62 | file.imbue(loc); | |
63 | file.open("hello.txt"); | |
64 | file << L"שלום!" << endl; | |
65 | \endcode | |
66 | ||
67 | This would create a file \c hello.txt encoded as UTF-8 with "שלום!" (shalom) in it. | |
68 | ||
69 | \section codecvt_iostreams_integration Integration with Boost.Iostreams | |
70 | ||
71 | You can use the \c std::codecvt facet directly, but this is quite tricky and | |
72 | requires accurate buffer and error management. | |
73 | ||
74 | You can use the \c boost::iostreams::code_converter class for stream-oriented | |
75 | conversions between the wide-character set and narrow locale character set. | |
76 | ||
77 | This is a sample program that converts wide to narrow characters for an arbitrary | |
78 | stream: | |
79 | ||
80 | \code | |
81 | #include <boost/iostreams/stream.hpp> | |
82 | #include <boost/iostreams/categories.hpp> | |
83 | #include <boost/iostreams/code_converter.hpp> | |
84 | ||
85 | #include <boost/locale.hpp> | |
86 | #include <iostream> | |
87 | ||
88 | namespace io = boost::iostreams; | |
89 | ||
90 | // Device that consumes the converted text, | |
91 | // In our case it just writes to standard output | |
92 | class consumer { | |
93 | public: | |
94 | typedef char char_type; | |
95 | typedef io::sink_tag category; | |
96 | std::streamsize write(const char* s, std::streamsize n) | |
97 | { | |
98 | std::cout.write(s,n); | |
99 | return n; | |
100 | } | |
101 | }; | |
102 | ||
103 | ||
104 | int main() | |
105 | { | |
106 | // the device that converts wide characters | |
107 | // to narrow | |
108 | typedef io::code_converter<consumer> converter_device; | |
109 | // the stream that uses this device | |
110 | typedef io::stream<converter_device> converter_stream; | |
111 | ||
112 | ||
113 | consumer cons; | |
114 | // set up our converter to work | |
115 | // with he_IL.UTF-8 locale | |
116 | converter_device dev; | |
117 | boost::locale::generator gen; | |
118 | dev.imbue(gen("he_IL.UTF-8")); | |
119 | dev.open(cons); | |
120 | converter_stream stream; | |
121 | stream.open(dev); | |
122 | // Now wide characters that are written | |
123 | // to the stream would be given to | |
124 | // our consumer as narrow characters | |
125 | // in UTF-8 encoding | |
126 | stream << L"שלום" << std::flush; | |
127 | } | |
128 | ||
129 | \endcode | |
130 | ||
131 | ||
132 | \section codecvt_limitations Limitations of std::codecvt | |
133 | ||
134 | The Standard does not provide any information about \c std::mbstate_t that could be used to save | |
135 | intermediate code-page conversion states. It leaves the definition up to the compiler implementation, making it | |
136 | impossible to reimplement <tt>std::codecvt<wchar_t,char,mbstate_t></tt> for stateful encodings. | |
137 | Thus, Boost.Locale's \c codecvt facet implementation may be used with stateless encodings like UTF-8, | |
138 | ISO-8859, and Shift-JIS, but not with stateful encodings like UTF-7 or SCSU. | |
139 | ||
140 | \b Recommendation: Prefer the Unicode UTF-8 encoding for \c char based strings and files in your application. | |
141 | ||
142 | \note | |
143 | ||
144 | The implementation of codecvt for single-byte encodings like ISO-8859-X and for UTF-8 is very efficient | |
145 | and would allow fast conversion of the content, however its performance may be sub-optimal for | |
146 | double-width encodings like Shift-JIS, due to the stateless problem described above. | |
147 | ||
148 | ||
149 | */ | |
150 | ||
151 |