]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> |
2 | <html> | |
3 | <!-- | |
4 | == Copyright (c) 2001 Ronald Garcia | |
5 | == | |
6 | == Permission to use, copy, modify, distribute and sell this software | |
7 | == and its documentation for any purpose is hereby granted without fee, | |
8 | == provided that the above copyright notice appears in all copies and | |
9 | == that both that copyright notice and this permission notice appear | |
10 | == in supporting documentation. Ronald Garcia makes no | |
11 | == representations about the suitability of this software for any | |
12 | == purpose. It is provided "as is" without express or implied warranty. | |
13 | --> | |
14 | <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> | |
15 | <link rel="stylesheet" type="text/css" href="../../../boost.css"> | |
16 | <link rel="stylesheet" type="text/css" href="style.css"> | |
17 | <head> | |
18 | <title>UTF-8 Codecvt Facet</title> | |
19 | ||
20 | </head> | |
21 | ||
22 | <body bgcolor="#ffffff" link="#0000ee" text="#000000" | |
23 | vlink="#551a8b" alink="#ff0000"> | |
24 | <img src="../../../boost.png" alt="C++ Boost" | |
25 | width="277" height="86"> <br clear="all"> | |
26 | ||
27 | ||
28 | <a name="sec:utf8-codecvt-facet-class"></a> | |
29 | ||
30 | ||
31 | <h1><code>utf8_codecvt_facet</code></h1> | |
32 | ||
33 | ||
34 | <pre> | |
35 | template< | |
36 | typename InternType = wchar_t, | |
37 | typename ExternType = char | |
38 | > utf8_codecvt_facet | |
39 | </pre> | |
40 | ||
41 | ||
42 | <h2>Rationale</h2> | |
43 | ||
44 | ||
45 | UTF-8 is a method of encoding Unicode text in environments | |
46 | where data is stored as 8-bit characters and some ASCII characters | |
47 | are considered special (e.g. Unix filesystem filenames) and tend | |
48 | to appear more commonly than other characters. While | |
49 | UTF-8 is convenient and efficient for storing data on filesystems, | |
50 | it was not meant to be manipulated in memory by | |
51 | applications. While some applications (such as Unix's 'cat') can | |
52 | simply ignore the encoding of data, others should convert | |
53 | from UTF-8 to UCS-4 (the more canonical representation of Unicode) | |
54 | on reading from file, and reverse the process on writing out to | |
55 | file. | |
56 | ||
57 | <p>The C++ Standard IOStreams provides the <tt>std::codecvt</tt> | |
58 | facet to handle specifically these cases. On reading from or | |
59 | writing to a file, the <tt>std::basic_filebuf</tt> can call out to | |
60 | the codecvt facet to convert data representations from external | |
61 | format (e.g. UTF-8) to internal format (e.g. UCS-4) and | |
62 | vice-versa. <tt>utf8_codecvt_facet</tt> is a specialization of | |
63 | <tt>std::codecvt</tt> specifically designed to handle the case | |
64 | of translating between UTF-8 and UCS-4. | |
65 | ||
66 | ||
67 | <h2>Template Parameters</h2> | |
68 | ||
69 | <table border summary="template parameters"> | |
70 | <tr> | |
71 | <th>Parameter</th><th>Description</th><th>Default</th> | |
72 | </tr> | |
73 | ||
74 | <tr> | |
75 | <td><tt>InternType</tt></td> | |
76 | <td>The internal type used to represent UCS-4 characters.</td> | |
77 | <td><tt>wchar_t</tt></td> | |
78 | </tr> | |
79 | ||
80 | <tr> | |
81 | <td><tt>ExternType</tt></td> | |
82 | <td>The external type used to represent UTF-8 octets.</td> | |
83 | <td><tt>char</tt></td> | |
84 | </tr> | |
85 | </table> | |
86 | ||
87 | ||
88 | <h2>Requirements</h2> | |
89 | ||
90 | <tt>utf8_codecvt_facet</tt> defaults to using <tt>char</tt> as | |
91 | its external data type and <tt>wchar_t</tt> as its internal | |
92 | datatype, but on some architectures <tt>wchar_t</tt> is | |
93 | not large enough to hold UCS-4 characters. In order to use | |
94 | another internal type, you must also specialize <tt>std::codecvt</tt> | |
95 | to handle your internal and external types. | |
96 | (<tt>std::codecvt<wchar_t,char,std::mbstate_t></tt> is required to be | |
97 | supplied by any standard-conforming compiler). | |
98 | ||
99 | ||
100 | <h2>Example Use</h2> | |
101 | The following is a simple example of using this facet: | |
102 | ||
103 | <pre> | |
104 | //... | |
105 | // My encoding type | |
106 | typedef wchar_t ucs4_t; | |
107 | ||
108 | std::locale old_locale; | |
109 | std::locale utf8_locale(old_locale,new utf8_codecvt_facet<ucs4_t>); | |
110 | ||
111 | // Set a new global locale | |
112 | std::locale::global(utf8_locale); | |
113 | ||
114 | // Send the UCS-4 data out, converting to UTF-8 | |
115 | { | |
116 | std::wofstream ofs("data.ucd"); | |
117 | ofs.imbue(utf8_locale); | |
118 | std::copy(ucs4_data.begin(),ucs4_data.end(), | |
119 | std::ostream_iterator<ucs4_t,ucs4_t>(ofs)); | |
120 | } | |
121 | ||
122 | // Read the UTF-8 data back in, converting to UCS-4 on the way in | |
123 | std::vector<ucs4_t> from_file; | |
124 | { | |
125 | std::wifstream ifs("data.ucd"); | |
126 | ifs.imbue(utf8_locale); | |
127 | ucs4_t item = 0; | |
128 | while (ifs >> item) from_file.push_back(item); | |
129 | } | |
130 | //... | |
131 | </pre> | |
132 | ||
133 | ||
134 | <h2>History</h2> | |
135 | ||
136 | This code was originally written as an iterator adaptor over | |
137 | containers for use with UTF-8 encoded strings in memory. | |
138 | Dietmar Kuehl suggested that it would be better provided as a | |
139 | codecvt facet. | |
140 | ||
141 | <h2>Resources</h2> | |
142 | ||
143 | <ul> | |
144 | <li> <a href="http://www.unicode.org">Unicode Homepage</a> | |
145 | <li> <a href="http://home.CameloT.de/langer/iostreams.htm">Standard | |
146 | C++ IOStreams and Locales</a> | |
147 | <li> <a href="http://www.research.att.com/~bs/3rd.html">The C++ | |
148 | Programming Language Special Edition, Appendix D.</a> | |
149 | </ul> | |
150 | ||
151 | <br> | |
152 | <hr> | |
153 | <table summary="Copyright information"> | |
154 | <tr valign="top"> | |
155 | <td nowrap>Copyright © 2001</td> | |
156 | <td><a href="http://www.osl.iu.edu/~garcia">Ronald Garcia</a>, | |
157 | Indiana University | |
158 | (<a href="mailto:garcia@cs.indiana.edu">garcia@osl.iu.edu</a>)<br> | |
159 | <a href="http://www.osl.iu.edu/~lums">Andrew Lumsdaine</a>, | |
160 | Indiana University | |
161 | (<a href="mailto:lums@osl.iu.edu">lums@osl.iu.edu</a>)</td> | |
162 | </tr> | |
163 | </table> | |
164 | <p><i>© Copyright <a href="http://www.rrsd.com">Robert Ramey</a> 2002-2004. | |
165 | Distributed under the Boost Software License, Version 1.0. (See | |
166 | accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
167 | </i></p> | |
168 | </body> | |
169 | </html> | |
170 | ||
171 |