]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/regex/doc/icu_strings.qbk
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / boost / libs / regex / doc / icu_strings.qbk
1 [/
2 Copyright 2006-2007 John Maddock.
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt).
6 ]
7
8
9 [section:icu Working With Unicode and ICU String Types]
10
11 [section:intro Introduction to using Regex with ICU]
12
13 The header:
14
15 <boost/regex/icu.hpp>
16
17 contains the data types and algorithms necessary for working with regular
18 expressions in a Unicode aware environment.
19
20 In order to use this header you will need the
21 [@http://www.ibm.com/software/globalization/icu/ ICU library], and you will need
22 to have built the Boost.Regex library with
23 [link boost_regex.install.building_with_unicode_and_icu_su ICU support enabled].
24
25 The header will enable you to:
26
27 * Create regular expressions that treat Unicode strings as sequences of UTF-32 code points.
28 * Create regular expressions that support various Unicode data properties, including character classification.
29 * Transparently search Unicode strings that are encoded as either UTF-8, UTF-16 or UTF-32.
30
31 [endsect]
32
33 [section:unicode_types Unicode regular expression types]
34
35 Header `<boost/regex/icu.hpp>` provides a regular expression traits class that
36 handles UTF-32 characters:
37
38 class icu_regex_traits;
39
40 and a regular expression type based upon that:
41
42 typedef basic_regex<UChar32,icu_regex_traits> u32regex;
43
44 The type `u32regex` is the regular expression type to use for all Unicode
45 regular expressions; internally it uses UTF-32 code points, but can be
46 created from, and used to search, either UTF-8, or UTF-16 encoded strings
47 as well as UTF-32 ones.
48
49 The constructors, and assign member functions of `u32regex`, require UTF-32
50 encoded strings, but there are a series of overloaded algorithms called
51 `make_u32regex` which allow regular expressions to be created from
52 UTF-8, UTF-16, or UTF-32 encoded strings:
53
54 template <class InputIterator>
55 u32regex make_u32regex(InputIterator i,
56 InputIterator j,
57 boost::regex_constants::syntax_option_type opt);
58
59 [*Effects]: Creates a regular expression object from the iterator sequence \[i,j).
60 The character encoding of the sequence is determined based upon sizeof(*i):
61 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
62
63 u32regex make_u32regex(const char* p,
64 boost::regex_constants::syntax_option_type opt
65 = boost::regex_constants::perl);
66
67 [*Effects]: Creates a regular expression object from the Null-terminated
68 UTF-8 character sequence /p/.
69
70 u32regex make_u32regex(const unsigned char* p,
71 boost::regex_constants::syntax_option_type opt
72 = boost::regex_constants::perl);
73
74 [*Effects]: Creates a regular expression object from the Null-terminated UTF-8 character sequence p.
75
76 u32regex make_u32regex(const wchar_t* p,
77 boost::regex_constants::syntax_option_type opt
78 = boost::regex_constants::perl);
79
80 [*Effects]: Creates a regular expression object from the Null-terminated character sequence p. The character encoding of the sequence is determined based upon sizeof(wchar_t): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
81
82 u32regex make_u32regex(const UChar* p,
83 boost::regex_constants::syntax_option_type opt
84 = boost::regex_constants::perl);
85
86 [*Effects]: Creates a regular expression object from the Null-terminated UTF-16 character sequence p.
87
88 template<class C, class T, class A>
89 u32regex make_u32regex(const std::basic_string<C, T, A>& s,
90 boost::regex_constants::syntax_option_type opt
91 = boost::regex_constants::perl);
92
93 [*Effects]: Creates a regular expression object from the string s. The character encoding of the string is determined based upon sizeof(C): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
94
95 u32regex make_u32regex(const UnicodeString& s,
96 boost::regex_constants::syntax_option_type opt
97 = boost::regex_constants::perl);
98
99 [*Effects]: Creates a regular expression object from the UTF-16 encoded string s.
100
101 [endsect]
102
103 [section:unicode_algo Unicode Regular Expression Algorithms]
104
105 The regular expression algorithms [regex_match], [regex_search] and [regex_replace]
106 all expect that the character sequence upon which they operate,
107 is encoded in the same character encoding as the regular expression object
108 with which they are used. For Unicode regular expressions that behavior is
109 undesirable: while we may want to process the data in UTF-32 "chunks", the
110 actual data is much more likely to be encoded as either UTF-8 or UTF-16.
111 Therefore the header `<boost/regex/icu.hpp>` provides a series of thin wrappers
112 around these algorithms, called `u32regex_match`, `u32regex_search`, and
113 `u32regex_replace`. These wrappers use iterator-adapters internally to
114 make external UTF-8 or UTF-16 data look as though it's really a UTF-32 sequence,
115 that can then be passed on to the "real" algorithm.
116
117 [h4 u32regex_match]
118
119 For each [regex_match] algorithm defined by `<boost/regex.hpp>`, then
120 `<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the
121 same arguments, but which is called `u32regex_match`, and which will
122 accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an
123 ICU UnicodeString as input.
124
125 Example: match a password, encoded in a UTF-16 UnicodeString:
126
127 //
128 // Find out if *password* meets our password requirements,
129 // as defined by the regular expression *requirements*.
130 //
131 bool is_valid_password(const UnicodeString& password, const UnicodeString& requirements)
132 {
133 return boost::u32regex_match(password, boost::make_u32regex(requirements));
134 }
135
136 Example: match a UTF-8 encoded filename:
137
138 //
139 // Extract filename part of a path from a UTF-8 encoded std::string and return the result
140 // as another std::string:
141 //
142 std::string get_filename(const std::string& path)
143 {
144 boost::u32regex r = boost::make_u32regex("(?:\\A|.*\\\\)([^\\\\]+)");
145 boost::smatch what;
146 if(boost::u32regex_match(path, what, r))
147 {
148 // extract $1 as a std::string:
149 return what.str(1);
150 }
151 else
152 {
153 throw std::runtime_error("Invalid pathname");
154 }
155 }
156
157 [h4 u32regex_search]
158
159 For each [regex_search] algorithm defined by `<boost/regex.hpp>`, then
160 `<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the
161 same arguments, but which is called `u32regex_search`, and which will
162 accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU
163 UnicodeString as input.
164
165 Example: search for a character sequence in a specific language block:
166
167 UnicodeString extract_greek(const UnicodeString& text)
168 {
169 // searches through some UTF-16 encoded text for a block encoded in Greek,
170 // this expression is imperfect, but the best we can do for now - searching
171 // for specific scripts is actually pretty hard to do right.
172 //
173 // Here we search for a character sequence that begins with a Greek letter,
174 // and continues with characters that are either not-letters ( [^[:L*:]] )
175 // or are characters in the Greek character block ( [\\x{370}-\\x{3FF}] ).
176 //
177 boost::u32regex r = boost::make_u32regex(
178 L"[\\x{370}-\\x{3FF}](?:[^[:L*:]]|[\\x{370}-\\x{3FF}])*");
179 boost::u16match what;
180 if(boost::u32regex_search(text, what, r))
181 {
182 // extract $0 as a UnicodeString:
183 return UnicodeString(what[0].first, what.length(0));
184 }
185 else
186 {
187 throw std::runtime_error("No Greek found!");
188 }
189 }
190
191 [h4 u32regex_replace]
192
193 For each [regex_replace] algorithm defined by `<boost/regex.hpp>`, then
194 `<boost/regex/icu.hpp>` defines an overloaded algorithm that takes
195 the same arguments, but which is called `u32regex_replace`, and which will
196 accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU
197 UnicodeString as input. The input sequence and the format string specifier
198 passed to the algorithm, can be encoded independently (for example one can
199 be UTF-8, the other in UTF-16), but the result string / output iterator
200 argument must use the same character encoding as the text being searched.
201
202 Example: Credit card number reformatting:
203
204 //
205 // Take a credit card number as a string of digits,
206 // and reformat it as a human readable string with "-"
207 // separating each group of four digits;
208 // note that we're mixing a UTF-32 regex, with a UTF-16
209 // string and a UTF-8 format specifier, and it still all
210 // just works:
211 //
212 const boost::u32regex e = boost::make_u32regex(
213 "\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z");
214 const char* human_format = "$1-$2-$3-$4";
215
216 UnicodeString human_readable_card_number(const UnicodeString& s)
217 {
218 return boost::u32regex_replace(s, e, human_format);
219 }
220
221 [endsect]
222 [section:unicode_iter Unicode Aware Regex Iterators]
223
224 [h4 u32regex_iterator]
225
226 Type `u32regex_iterator` is in all respects the same as [regex_iterator]
227 except that since the regular expression type is always `u32regex`
228 it only takes one template parameter (the iterator type). It also calls
229 `u32regex_search` internally, allowing it to interface correctly with
230 UTF-8, UTF-16, and UTF-32 data:
231
232 template <class BidirectionalIterator>
233 class u32regex_iterator
234 {
235 // for members see regex_iterator
236 };
237
238 typedef u32regex_iterator<const char*> utf8regex_iterator;
239 typedef u32regex_iterator<const UChar*> utf16regex_iterator;
240 typedef u32regex_iterator<const UChar32*> utf32regex_iterator;
241
242 In order to simplify the construction of a `u32regex_iterator` from a string,
243 there are a series of non-member helper functions called `make_u32regex_iterator`:
244
245 u32regex_iterator<const char*>
246 make_u32regex_iterator(const char* s,
247 const u32regex& e,
248 regex_constants::match_flag_type m = regex_constants::match_default);
249
250 u32regex_iterator<const wchar_t*>
251 make_u32regex_iterator(const wchar_t* s,
252 const u32regex& e,
253 regex_constants::match_flag_type m = regex_constants::match_default);
254
255 u32regex_iterator<const UChar*>
256 make_u32regex_iterator(const UChar* s,
257 const u32regex& e,
258 regex_constants::match_flag_type m = regex_constants::match_default);
259
260 template <class charT, class Traits, class Alloc>
261 u32regex_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
262 make_u32regex_iterator(const std::basic_string<charT, Traits, Alloc>& s,
263 const u32regex& e,
264 regex_constants::match_flag_type m = regex_constants::match_default);
265
266 u32regex_iterator<const UChar*>
267 make_u32regex_iterator(const UnicodeString& s,
268 const u32regex& e,
269 regex_constants::match_flag_type m = regex_constants::match_default);
270
271 Each of these overloads returns an iterator that enumerates all occurrences
272 of expression /e/, in text /s/, using match_flags /m/.
273
274 Example: search for international currency symbols, along with their associated numeric value:
275
276 void enumerate_currencies(const std::string& text)
277 {
278 // enumerate and print all the currency symbols, along
279 // with any associated numeric values:
280 const char* re =
281 "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
282 "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
283 "(?(1)"
284 "|(?(2)"
285 "[[:Cf:][:Cc:][:Z*:]]*"
286 ")"
287 "[[:Sc:]]"
288 ")";
289 boost::u32regex r = boost::make_u32regex(re);
290 boost::u32regex_iterator<std::string::const_iterator>
291 i(boost::make_u32regex_iterator(text, r)), j;
292 while(i != j)
293 {
294 std::cout << (*i)[0] << std::endl;
295 ++i;
296 }
297 }
298
299 Calling
300
301 [/this doesn't format correctly as code:]
302 [pre enumerate_currencies(" $100.23 or '''&#xA3;'''198.12 ");]
303
304 Yields the output:
305
306 [pre
307 $100.23
308 '''&#xA3;'''198.12
309 ]
310
311 Provided of course that the input is encoded as UTF-8.
312
313 [h4 u32regex_token_iterator]
314
315 Type `u32regex_token_iterator` is in all respects the same as [regex_token_iterator]
316 except that since the regular expression type is always `u32regex` it only
317 takes one template parameter (the iterator type). It also calls
318 `u32regex_search` internally, allowing it to interface correctly with UTF-8,
319 UTF-16, and UTF-32 data:
320
321 template <class BidirectionalIterator>
322 class u32regex_token_iterator
323 {
324 // for members see regex_token_iterator
325 };
326
327 typedef u32regex_token_iterator<const char*> utf8regex_token_iterator;
328 typedef u32regex_token_iterator<const UChar*> utf16regex_token_iterator;
329 typedef u32regex_token_iterator<const UChar32*> utf32regex_token_iterator;
330
331 In order to simplify the construction of a `u32regex_token_iterator` from a string,
332 there are a series of non-member helper functions called `make_u32regex_token_iterator`:
333
334 u32regex_token_iterator<const char*>
335 make_u32regex_token_iterator(
336 const char* s,
337 const u32regex& e,
338 int sub,
339 regex_constants::match_flag_type m = regex_constants::match_default);
340
341 u32regex_token_iterator<const wchar_t*>
342 make_u32regex_token_iterator(
343 const wchar_t* s,
344 const u32regex& e,
345 int sub,
346 regex_constants::match_flag_type m = regex_constants::match_default);
347
348 u32regex_token_iterator<const UChar*>
349 make_u32regex_token_iterator(
350 const UChar* s,
351 const u32regex& e,
352 int sub,
353 regex_constants::match_flag_type m = regex_constants::match_default);
354
355 template <class charT, class Traits, class Alloc>
356 u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
357 make_u32regex_token_iterator(
358 const std::basic_string<charT, Traits, Alloc>& s,
359 const u32regex& e,
360 int sub,
361 regex_constants::match_flag_type m = regex_constants::match_default);
362
363 u32regex_token_iterator<const UChar*>
364 make_u32regex_token_iterator(
365 const UnicodeString& s,
366 const u32regex& e,
367 int sub,
368 regex_constants::match_flag_type m = regex_constants::match_default);
369
370 Each of these overloads returns an iterator that enumerates all occurrences of
371 marked sub-expression /sub/ in regular expression /e/, found in text /s/, using
372 match_flags /m/.
373
374 template <std::size_t N>
375 u32regex_token_iterator<const char*>
376 make_u32regex_token_iterator(
377 const char* p,
378 const u32regex& e,
379 const int (&submatch)[N],
380 regex_constants::match_flag_type m = regex_constants::match_default);
381
382 template <std::size_t N>
383 u32regex_token_iterator<const wchar_t*>
384 make_u32regex_token_iterator(
385 const wchar_t* p,
386 const u32regex& e,
387 const int (&submatch)[N],
388 regex_constants::match_flag_type m = regex_constants::match_default);
389
390 template <std::size_t N>
391 u32regex_token_iterator<const UChar*>
392 make_u32regex_token_iterator(
393 const UChar* p,
394 const u32regex& e,
395 const int (&submatch)[N],
396 regex_constants::match_flag_type m = regex_constants::match_default);
397
398 template <class charT, class Traits, class Alloc, std::size_t N>
399 u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
400 make_u32regex_token_iterator(
401 const std::basic_string<charT, Traits, Alloc>& p,
402 const u32regex& e,
403 const int (&submatch)[N],
404 regex_constants::match_flag_type m = regex_constants::match_default);
405
406 template <std::size_t N>
407 u32regex_token_iterator<const UChar*>
408 make_u32regex_token_iterator(
409 const UnicodeString& s,
410 const u32regex& e,
411 const int (&submatch)[N],
412 regex_constants::match_flag_type m = regex_constants::match_default);
413
414 Each of these overloads returns an iterator that enumerates one sub-expression
415 for each submatch in regular expression /e/, found in text /s/, using match_flags /m/.
416
417 u32regex_token_iterator<const char*>
418 make_u32regex_token_iterator(
419 const char* p,
420 const u32regex& e,
421 const std::vector<int>& submatch,
422 regex_constants::match_flag_type m = regex_constants::match_default);
423
424 u32regex_token_iterator<const wchar_t*>
425 make_u32regex_token_iterator(
426 const wchar_t* p,
427 const u32regex& e,
428 const std::vector<int>& submatch,
429 regex_constants::match_flag_type m = regex_constants::match_default);
430
431 u32regex_token_iterator<const UChar*>
432 make_u32regex_token_iterator(
433 const UChar* p,
434 const u32regex& e,
435 const std::vector<int>& submatch,
436 regex_constants::match_flag_type m = regex_constants::match_default);
437
438 template <class charT, class Traits, class Alloc>
439 u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
440 make_u32regex_token_iterator(
441 const std::basic_string<charT, Traits, Alloc>& p,
442 const u32regex& e,
443 const std::vector<int>& submatch,
444 regex_constants::match_flag_type m = regex_constants::match_default);
445
446 u32regex_token_iterator<const UChar*>
447 make_u32regex_token_iterator(
448 const UnicodeString& s,
449 const u32regex& e,
450 const std::vector<int>& submatch,
451 regex_constants::match_flag_type m = regex_constants::match_default);
452
453 Each of these overloads returns an iterator that enumerates one sub-expression for
454 each submatch in regular expression /e/, found in text /s/, using match_flags /m/.
455
456 Example: search for international currency symbols, along with their associated numeric value:
457
458 void enumerate_currencies2(const std::string& text)
459 {
460 // enumerate and print all the currency symbols, along
461 // with any associated numeric values:
462 const char* re =
463 "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
464 "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
465 "(?(1)"
466 "|(?(2)"
467 "[[:Cf:][:Cc:][:Z*:]]*"
468 ")"
469 "[[:Sc:]]"
470 ")";
471 boost::u32regex r = boost::make_u32regex(re);
472 boost::u32regex_token_iterator<std::string::const_iterator>
473 i(boost::make_u32regex_token_iterator(text, r, 1)), j;
474 while(i != j)
475 {
476 std::cout << *i << std::endl;
477 ++i;
478 }
479 }
480
481 [endsect]
482
483 [endsect]
484