]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | [/ |
2 | Copyright 2006-2007 John Maddock. | |
3 | Distributed under the Boost Software License, Version 1.0. | |
4 | (See accompanying file LICENSE_1_0.txt or copy at | |
5 | http://www.boost.org/LICENSE_1_0.txt). | |
6 | ] | |
7 | ||
8 | ||
9 | [section:icu Working With Unicode and ICU String Types] | |
10 | ||
11 | [section:intro Introduction to using Regex with ICU] | |
12 | ||
13 | The header: | |
14 | ||
15 | <boost/regex/icu.hpp> | |
16 | ||
17 | contains the data types and algorithms necessary for working with regular | |
18 | expressions in a Unicode aware environment. | |
19 | ||
20 | In order to use this header you will need the | |
21 | [@http://www.ibm.com/software/globalization/icu/ ICU library], and you will need | |
22 | to have built the Boost.Regex library with | |
23 | [link boost_regex.install.building_with_unicode_and_icu_su ICU support enabled]. | |
24 | ||
25 | The header will enable you to: | |
26 | ||
27 | * Create regular expressions that treat Unicode strings as sequences of UTF-32 code points. | |
28 | * Create regular expressions that support various Unicode data properties, including character classification. | |
29 | * Transparently search Unicode strings that are encoded as either UTF-8, UTF-16 or UTF-32. | |
30 | ||
31 | [endsect] | |
32 | ||
33 | [section:unicode_types Unicode regular expression types] | |
34 | ||
35 | Header `<boost/regex/icu.hpp>` provides a regular expression traits class that | |
36 | handles UTF-32 characters: | |
37 | ||
38 | class icu_regex_traits; | |
39 | ||
40 | and a regular expression type based upon that: | |
41 | ||
42 | typedef basic_regex<UChar32,icu_regex_traits> u32regex; | |
43 | ||
44 | The type `u32regex` is the regular expression type to use for all Unicode | |
45 | regular expressions; internally it uses UTF-32 code points, but can be | |
46 | created from, and used to search, either UTF-8, or UTF-16 encoded strings | |
47 | as well as UTF-32 ones. | |
48 | ||
49 | The constructors, and assign member functions of `u32regex`, require UTF-32 | |
50 | encoded strings, but there are a series of overloaded algorithms called | |
51 | `make_u32regex` which allow regular expressions to be created from | |
52 | UTF-8, UTF-16, or UTF-32 encoded strings: | |
53 | ||
54 | template <class InputIterator> | |
55 | u32regex make_u32regex(InputIterator i, | |
56 | InputIterator j, | |
57 | boost::regex_constants::syntax_option_type opt); | |
58 | ||
59 | [*Effects]: Creates a regular expression object from the iterator sequence \[i,j). | |
60 | The character encoding of the sequence is determined based upon sizeof(*i): | |
61 | 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32. | |
62 | ||
63 | u32regex make_u32regex(const char* p, | |
64 | boost::regex_constants::syntax_option_type opt | |
65 | = boost::regex_constants::perl); | |
66 | ||
67 | [*Effects]: Creates a regular expression object from the Null-terminated | |
68 | UTF-8 character sequence /p/. | |
69 | ||
70 | u32regex make_u32regex(const unsigned char* p, | |
71 | boost::regex_constants::syntax_option_type opt | |
72 | = boost::regex_constants::perl); | |
73 | ||
74 | [*Effects]: Creates a regular expression object from the Null-terminated UTF-8 character sequence p. | |
75 | ||
76 | u32regex make_u32regex(const wchar_t* p, | |
77 | boost::regex_constants::syntax_option_type opt | |
78 | = boost::regex_constants::perl); | |
79 | ||
80 | [*Effects]: Creates a regular expression object from the Null-terminated character sequence p. The character encoding of the sequence is determined based upon sizeof(wchar_t): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32. | |
81 | ||
82 | u32regex make_u32regex(const UChar* p, | |
83 | boost::regex_constants::syntax_option_type opt | |
84 | = boost::regex_constants::perl); | |
85 | ||
86 | [*Effects]: Creates a regular expression object from the Null-terminated UTF-16 character sequence p. | |
87 | ||
88 | template<class C, class T, class A> | |
89 | u32regex make_u32regex(const std::basic_string<C, T, A>& s, | |
90 | boost::regex_constants::syntax_option_type opt | |
91 | = boost::regex_constants::perl); | |
92 | ||
93 | [*Effects]: Creates a regular expression object from the string s. The character encoding of the string is determined based upon sizeof(C): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32. | |
94 | ||
95 | u32regex make_u32regex(const UnicodeString& s, | |
96 | boost::regex_constants::syntax_option_type opt | |
97 | = boost::regex_constants::perl); | |
98 | ||
99 | [*Effects]: Creates a regular expression object from the UTF-16 encoded string s. | |
100 | ||
101 | [endsect] | |
102 | ||
103 | [section:unicode_algo Unicode Regular Expression Algorithms] | |
104 | ||
105 | The regular expression algorithms [regex_match], [regex_search] and [regex_replace] | |
106 | all expect that the character sequence upon which they operate, | |
107 | is encoded in the same character encoding as the regular expression object | |
108 | with which they are used. For Unicode regular expressions that behavior is | |
109 | undesirable: while we may want to process the data in UTF-32 "chunks", the | |
110 | actual data is much more likely to be encoded as either UTF-8 or UTF-16. | |
111 | Therefore the header `<boost/regex/icu.hpp>` provides a series of thin wrappers | |
112 | around these algorithms, called `u32regex_match`, `u32regex_search`, and | |
113 | `u32regex_replace`. These wrappers use iterator-adapters internally to | |
114 | make external UTF-8 or UTF-16 data look as though it's really a UTF-32 sequence, | |
115 | that can then be passed on to the "real" algorithm. | |
116 | ||
117 | [h4 u32regex_match] | |
118 | ||
119 | For each [regex_match] algorithm defined by `<boost/regex.hpp>`, then | |
120 | `<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the | |
121 | same arguments, but which is called `u32regex_match`, and which will | |
122 | accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an | |
123 | ICU UnicodeString as input. | |
124 | ||
125 | Example: match a password, encoded in a UTF-16 UnicodeString: | |
126 | ||
127 | // | |
128 | // Find out if *password* meets our password requirements, | |
129 | // as defined by the regular expression *requirements*. | |
130 | // | |
131 | bool is_valid_password(const UnicodeString& password, const UnicodeString& requirements) | |
132 | { | |
133 | return boost::u32regex_match(password, boost::make_u32regex(requirements)); | |
134 | } | |
135 | ||
136 | Example: match a UTF-8 encoded filename: | |
137 | ||
138 | // | |
139 | // Extract filename part of a path from a UTF-8 encoded std::string and return the result | |
140 | // as another std::string: | |
141 | // | |
142 | std::string get_filename(const std::string& path) | |
143 | { | |
144 | boost::u32regex r = boost::make_u32regex("(?:\\A|.*\\\\)([^\\\\]+)"); | |
145 | boost::smatch what; | |
146 | if(boost::u32regex_match(path, what, r)) | |
147 | { | |
148 | // extract $1 as a std::string: | |
149 | return what.str(1); | |
150 | } | |
151 | else | |
152 | { | |
153 | throw std::runtime_error("Invalid pathname"); | |
154 | } | |
155 | } | |
156 | ||
157 | [h4 u32regex_search] | |
158 | ||
159 | For each [regex_search] algorithm defined by `<boost/regex.hpp>`, then | |
160 | `<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the | |
161 | same arguments, but which is called `u32regex_search`, and which will | |
162 | accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU | |
163 | UnicodeString as input. | |
164 | ||
165 | Example: search for a character sequence in a specific language block: | |
166 | ||
167 | UnicodeString extract_greek(const UnicodeString& text) | |
168 | { | |
169 | // searches through some UTF-16 encoded text for a block encoded in Greek, | |
170 | // this expression is imperfect, but the best we can do for now - searching | |
171 | // for specific scripts is actually pretty hard to do right. | |
172 | // | |
173 | // Here we search for a character sequence that begins with a Greek letter, | |
174 | // and continues with characters that are either not-letters ( [^[:L*:]] ) | |
175 | // or are characters in the Greek character block ( [\\x{370}-\\x{3FF}] ). | |
176 | // | |
177 | boost::u32regex r = boost::make_u32regex( | |
178 | L"[\\x{370}-\\x{3FF}](?:[^[:L*:]]|[\\x{370}-\\x{3FF}])*"); | |
179 | boost::u16match what; | |
180 | if(boost::u32regex_search(text, what, r)) | |
181 | { | |
182 | // extract $0 as a UnicodeString: | |
183 | return UnicodeString(what[0].first, what.length(0)); | |
184 | } | |
185 | else | |
186 | { | |
187 | throw std::runtime_error("No Greek found!"); | |
188 | } | |
189 | } | |
190 | ||
191 | [h4 u32regex_replace] | |
192 | ||
193 | For each [regex_replace] algorithm defined by `<boost/regex.hpp>`, then | |
194 | `<boost/regex/icu.hpp>` defines an overloaded algorithm that takes | |
195 | the same arguments, but which is called `u32regex_replace`, and which will | |
196 | accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU | |
197 | UnicodeString as input. The input sequence and the format string specifier | |
198 | passed to the algorithm, can be encoded independently (for example one can | |
199 | be UTF-8, the other in UTF-16), but the result string / output iterator | |
200 | argument must use the same character encoding as the text being searched. | |
201 | ||
202 | Example: Credit card number reformatting: | |
203 | ||
204 | // | |
205 | // Take a credit card number as a string of digits, | |
206 | // and reformat it as a human readable string with "-" | |
207 | // separating each group of four digits; | |
208 | // note that we're mixing a UTF-32 regex, with a UTF-16 | |
209 | // string and a UTF-8 format specifier, and it still all | |
210 | // just works: | |
211 | // | |
212 | const boost::u32regex e = boost::make_u32regex( | |
213 | "\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z"); | |
214 | const char* human_format = "$1-$2-$3-$4"; | |
215 | ||
216 | UnicodeString human_readable_card_number(const UnicodeString& s) | |
217 | { | |
218 | return boost::u32regex_replace(s, e, human_format); | |
219 | } | |
220 | ||
221 | [endsect] | |
222 | [section:unicode_iter Unicode Aware Regex Iterators] | |
223 | ||
224 | [h4 u32regex_iterator] | |
225 | ||
226 | Type `u32regex_iterator` is in all respects the same as [regex_iterator] | |
227 | except that since the regular expression type is always `u32regex` | |
228 | it only takes one template parameter (the iterator type). It also calls | |
229 | `u32regex_search` internally, allowing it to interface correctly with | |
230 | UTF-8, UTF-16, and UTF-32 data: | |
231 | ||
232 | template <class BidirectionalIterator> | |
233 | class u32regex_iterator | |
234 | { | |
235 | // for members see regex_iterator | |
236 | }; | |
237 | ||
238 | typedef u32regex_iterator<const char*> utf8regex_iterator; | |
239 | typedef u32regex_iterator<const UChar*> utf16regex_iterator; | |
240 | typedef u32regex_iterator<const UChar32*> utf32regex_iterator; | |
241 | ||
242 | In order to simplify the construction of a `u32regex_iterator` from a string, | |
243 | there are a series of non-member helper functions called `make_u32regex_iterator`: | |
244 | ||
245 | u32regex_iterator<const char*> | |
246 | make_u32regex_iterator(const char* s, | |
247 | const u32regex& e, | |
248 | regex_constants::match_flag_type m = regex_constants::match_default); | |
249 | ||
250 | u32regex_iterator<const wchar_t*> | |
251 | make_u32regex_iterator(const wchar_t* s, | |
252 | const u32regex& e, | |
253 | regex_constants::match_flag_type m = regex_constants::match_default); | |
254 | ||
255 | u32regex_iterator<const UChar*> | |
256 | make_u32regex_iterator(const UChar* s, | |
257 | const u32regex& e, | |
258 | regex_constants::match_flag_type m = regex_constants::match_default); | |
259 | ||
260 | template <class charT, class Traits, class Alloc> | |
261 | u32regex_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> | |
262 | make_u32regex_iterator(const std::basic_string<charT, Traits, Alloc>& s, | |
263 | const u32regex& e, | |
264 | regex_constants::match_flag_type m = regex_constants::match_default); | |
265 | ||
266 | u32regex_iterator<const UChar*> | |
267 | make_u32regex_iterator(const UnicodeString& s, | |
268 | const u32regex& e, | |
269 | regex_constants::match_flag_type m = regex_constants::match_default); | |
270 | ||
271 | Each of these overloads returns an iterator that enumerates all occurrences | |
272 | of expression /e/, in text /s/, using match_flags /m/. | |
273 | ||
274 | Example: search for international currency symbols, along with their associated numeric value: | |
275 | ||
276 | void enumerate_currencies(const std::string& text) | |
277 | { | |
278 | // enumerate and print all the currency symbols, along | |
279 | // with any associated numeric values: | |
280 | const char* re = | |
281 | "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?" | |
282 | "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?" | |
283 | "(?(1)" | |
284 | "|(?(2)" | |
285 | "[[:Cf:][:Cc:][:Z*:]]*" | |
286 | ")" | |
287 | "[[:Sc:]]" | |
288 | ")"; | |
289 | boost::u32regex r = boost::make_u32regex(re); | |
290 | boost::u32regex_iterator<std::string::const_iterator> | |
291 | i(boost::make_u32regex_iterator(text, r)), j; | |
292 | while(i != j) | |
293 | { | |
294 | std::cout << (*i)[0] << std::endl; | |
295 | ++i; | |
296 | } | |
297 | } | |
298 | ||
299 | Calling | |
300 | ||
301 | [/this doesn't format correctly as code:] | |
302 | [pre enumerate_currencies(" $100.23 or '''£'''198.12 ");] | |
303 | ||
304 | Yields the output: | |
305 | ||
306 | [pre | |
307 | $100.23 | |
308 | '''£'''198.12 | |
309 | ] | |
310 | ||
311 | Provided of course that the input is encoded as UTF-8. | |
312 | ||
313 | [h4 u32regex_token_iterator] | |
314 | ||
315 | Type `u32regex_token_iterator` is in all respects the same as [regex_token_iterator] | |
316 | except that since the regular expression type is always `u32regex` it only | |
317 | takes one template parameter (the iterator type). It also calls | |
318 | `u32regex_search` internally, allowing it to interface correctly with UTF-8, | |
319 | UTF-16, and UTF-32 data: | |
320 | ||
321 | template <class BidirectionalIterator> | |
322 | class u32regex_token_iterator | |
323 | { | |
324 | // for members see regex_token_iterator | |
325 | }; | |
326 | ||
327 | typedef u32regex_token_iterator<const char*> utf8regex_token_iterator; | |
328 | typedef u32regex_token_iterator<const UChar*> utf16regex_token_iterator; | |
329 | typedef u32regex_token_iterator<const UChar32*> utf32regex_token_iterator; | |
330 | ||
331 | In order to simplify the construction of a `u32regex_token_iterator` from a string, | |
332 | there are a series of non-member helper functions called `make_u32regex_token_iterator`: | |
333 | ||
334 | u32regex_token_iterator<const char*> | |
335 | make_u32regex_token_iterator( | |
336 | const char* s, | |
337 | const u32regex& e, | |
338 | int sub, | |
339 | regex_constants::match_flag_type m = regex_constants::match_default); | |
340 | ||
341 | u32regex_token_iterator<const wchar_t*> | |
342 | make_u32regex_token_iterator( | |
343 | const wchar_t* s, | |
344 | const u32regex& e, | |
345 | int sub, | |
346 | regex_constants::match_flag_type m = regex_constants::match_default); | |
347 | ||
348 | u32regex_token_iterator<const UChar*> | |
349 | make_u32regex_token_iterator( | |
350 | const UChar* s, | |
351 | const u32regex& e, | |
352 | int sub, | |
353 | regex_constants::match_flag_type m = regex_constants::match_default); | |
354 | ||
355 | template <class charT, class Traits, class Alloc> | |
356 | u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> | |
357 | make_u32regex_token_iterator( | |
358 | const std::basic_string<charT, Traits, Alloc>& s, | |
359 | const u32regex& e, | |
360 | int sub, | |
361 | regex_constants::match_flag_type m = regex_constants::match_default); | |
362 | ||
363 | u32regex_token_iterator<const UChar*> | |
364 | make_u32regex_token_iterator( | |
365 | const UnicodeString& s, | |
366 | const u32regex& e, | |
367 | int sub, | |
368 | regex_constants::match_flag_type m = regex_constants::match_default); | |
369 | ||
370 | Each of these overloads returns an iterator that enumerates all occurrences of | |
371 | marked sub-expression /sub/ in regular expression /e/, found in text /s/, using | |
372 | match_flags /m/. | |
373 | ||
374 | template <std::size_t N> | |
375 | u32regex_token_iterator<const char*> | |
376 | make_u32regex_token_iterator( | |
377 | const char* p, | |
378 | const u32regex& e, | |
379 | const int (&submatch)[N], | |
380 | regex_constants::match_flag_type m = regex_constants::match_default); | |
381 | ||
382 | template <std::size_t N> | |
383 | u32regex_token_iterator<const wchar_t*> | |
384 | make_u32regex_token_iterator( | |
385 | const wchar_t* p, | |
386 | const u32regex& e, | |
387 | const int (&submatch)[N], | |
388 | regex_constants::match_flag_type m = regex_constants::match_default); | |
389 | ||
390 | template <std::size_t N> | |
391 | u32regex_token_iterator<const UChar*> | |
392 | make_u32regex_token_iterator( | |
393 | const UChar* p, | |
394 | const u32regex& e, | |
395 | const int (&submatch)[N], | |
396 | regex_constants::match_flag_type m = regex_constants::match_default); | |
397 | ||
398 | template <class charT, class Traits, class Alloc, std::size_t N> | |
399 | u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> | |
400 | make_u32regex_token_iterator( | |
401 | const std::basic_string<charT, Traits, Alloc>& p, | |
402 | const u32regex& e, | |
403 | const int (&submatch)[N], | |
404 | regex_constants::match_flag_type m = regex_constants::match_default); | |
405 | ||
406 | template <std::size_t N> | |
407 | u32regex_token_iterator<const UChar*> | |
408 | make_u32regex_token_iterator( | |
409 | const UnicodeString& s, | |
410 | const u32regex& e, | |
411 | const int (&submatch)[N], | |
412 | regex_constants::match_flag_type m = regex_constants::match_default); | |
413 | ||
414 | Each of these overloads returns an iterator that enumerates one sub-expression | |
415 | for each submatch in regular expression /e/, found in text /s/, using match_flags /m/. | |
416 | ||
417 | u32regex_token_iterator<const char*> | |
418 | make_u32regex_token_iterator( | |
419 | const char* p, | |
420 | const u32regex& e, | |
421 | const std::vector<int>& submatch, | |
422 | regex_constants::match_flag_type m = regex_constants::match_default); | |
423 | ||
424 | u32regex_token_iterator<const wchar_t*> | |
425 | make_u32regex_token_iterator( | |
426 | const wchar_t* p, | |
427 | const u32regex& e, | |
428 | const std::vector<int>& submatch, | |
429 | regex_constants::match_flag_type m = regex_constants::match_default); | |
430 | ||
431 | u32regex_token_iterator<const UChar*> | |
432 | make_u32regex_token_iterator( | |
433 | const UChar* p, | |
434 | const u32regex& e, | |
435 | const std::vector<int>& submatch, | |
436 | regex_constants::match_flag_type m = regex_constants::match_default); | |
437 | ||
438 | template <class charT, class Traits, class Alloc> | |
439 | u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator> | |
440 | make_u32regex_token_iterator( | |
441 | const std::basic_string<charT, Traits, Alloc>& p, | |
442 | const u32regex& e, | |
443 | const std::vector<int>& submatch, | |
444 | regex_constants::match_flag_type m = regex_constants::match_default); | |
445 | ||
446 | u32regex_token_iterator<const UChar*> | |
447 | make_u32regex_token_iterator( | |
448 | const UnicodeString& s, | |
449 | const u32regex& e, | |
450 | const std::vector<int>& submatch, | |
451 | regex_constants::match_flag_type m = regex_constants::match_default); | |
452 | ||
453 | Each of these overloads returns an iterator that enumerates one sub-expression for | |
454 | each submatch in regular expression /e/, found in text /s/, using match_flags /m/. | |
455 | ||
456 | Example: search for international currency symbols, along with their associated numeric value: | |
457 | ||
458 | void enumerate_currencies2(const std::string& text) | |
459 | { | |
460 | // enumerate and print all the currency symbols, along | |
461 | // with any associated numeric values: | |
462 | const char* re = | |
463 | "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?" | |
464 | "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?" | |
465 | "(?(1)" | |
466 | "|(?(2)" | |
467 | "[[:Cf:][:Cc:][:Z*:]]*" | |
468 | ")" | |
469 | "[[:Sc:]]" | |
470 | ")"; | |
471 | boost::u32regex r = boost::make_u32regex(re); | |
472 | boost::u32regex_token_iterator<std::string::const_iterator> | |
473 | i(boost::make_u32regex_token_iterator(text, r, 1)), j; | |
474 | while(i != j) | |
475 | { | |
476 | std::cout << *i << std::endl; | |
477 | ++i; | |
478 | } | |
479 | } | |
480 | ||
481 | [endsect] | |
482 | ||
483 | [endsect] | |
484 |