ceph/src/boost/libs/regex/doc/icu_strings.qbk

   1 [/
   2   Copyright 2006-2007 John Maddock.
   3   Distributed under the Boost Software License, Version 1.0.
   4   (See accompanying file LICENSE_1_0.txt or copy at
   5   http://www.boost.org/LICENSE_1_0.txt).
   6 ]
   7
   8
   9 [section:icu Working With Unicode and ICU String Types]
  10
  11 [section:intro Introduction to using Regex with ICU]
  12
  13 The header:
  14
  15    <boost/regex/icu.hpp>
  16
  17 contains the data types and algorithms necessary for working with regular
  18 expressions in a Unicode aware environment.
  19
  20 In order to use this header you will need the
  21 [@http://www.ibm.com/software/globalization/icu/ ICU library], and you will need
  22 to have built the Boost.Regex library with
  23 [link boost_regex.install.building_with_unicode_and_icu_su ICU support enabled].
  24
  25 The header will enable you to:
  26
  27 * Create regular expressions that treat Unicode strings as sequences of UTF-32 code points.
  28 * Create regular expressions that support various Unicode data properties, including character classification.
  29 * Transparently search Unicode strings that are encoded as either UTF-8, UTF-16 or UTF-32.
  30
  31 [endsect]
  32
  33 [section:unicode_types Unicode regular expression types]
  34
  35 Header `<boost/regex/icu.hpp>` provides a regular expression traits class that
  36 handles UTF-32 characters:
  37
  38    class icu_regex_traits;
  39
  40 and a regular expression type based upon that:
  41
  42    typedef basic_regex<UChar32,icu_regex_traits> u32regex;
  43
  44 The type `u32regex` is the regular expression type to use for all Unicode
  45 regular expressions; internally it uses UTF-32 code points, but can be
  46 created from, and used to search, either UTF-8, or UTF-16 encoded strings
  47 as well as UTF-32 ones.
  48
  49 The constructors, and assign member functions of `u32regex`, require UTF-32
  50 encoded strings, but there are a series of overloaded algorithms called
  51 `make_u32regex` which allow regular expressions to be created from
  52 UTF-8, UTF-16, or UTF-32 encoded strings:
  53
  54    template <class InputIterator>
  55    u32regex make_u32regex(InputIterator i,
  56                           InputIterator j,
  57                           boost::regex_constants::syntax_option_type opt);
  58
  59 [*Effects]: Creates a regular expression object from the iterator sequence \[i,j).
  60 The character encoding of the sequence is determined based upon sizeof(*i):
  61 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
  62
  63    u32regex make_u32regex(const char* p,
  64                           boost::regex_constants::syntax_option_type opt
  65                               = boost::regex_constants::perl);
  66
  67 [*Effects]: Creates a regular expression object from the Null-terminated
  68 UTF-8 character sequence /p/.
  69
  70    u32regex make_u32regex(const unsigned char* p,
  71                           boost::regex_constants::syntax_option_type opt
  72                               = boost::regex_constants::perl);
  73
  74 [*Effects]: Creates a regular expression object from the Null-terminated UTF-8 character sequence p.
  75
  76    u32regex make_u32regex(const wchar_t* p,
  77                           boost::regex_constants::syntax_option_type opt
  78                               = boost::regex_constants::perl);
  79
  80 [*Effects]: Creates a regular expression object from the Null-terminated character sequence p.  The character encoding of the sequence is determined based upon sizeof(wchar_t): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
  81
  82    u32regex make_u32regex(const UChar* p,
  83                           boost::regex_constants::syntax_option_type opt
  84                               = boost::regex_constants::perl);
  85
  86 [*Effects]: Creates a regular expression object from the Null-terminated UTF-16 character sequence p.
  87
  88    template<class C, class T, class A>
  89    u32regex make_u32regex(const std::basic_string<C, T, A>& s,
  90                           boost::regex_constants::syntax_option_type opt
  91                               = boost::regex_constants::perl);
  92
  93 [*Effects]: Creates a regular expression object from the string s.  The character encoding of the string is determined based upon sizeof(C): 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.
  94
  95    u32regex make_u32regex(const UnicodeString& s,
  96                           boost::regex_constants::syntax_option_type opt
  97                               = boost::regex_constants::perl);
  98
  99 [*Effects]: Creates a regular expression object from the UTF-16 encoded string s.
 100
 101 [endsect]
 102
 103 [section:unicode_algo Unicode Regular Expression Algorithms]
 104
 105 The regular expression algorithms [regex_match], [regex_search] and [regex_replace]
 106 all expect that the character sequence upon which they operate
 107 is encoded in the same character encoding as the regular expression object
 108 with which they are used.  For Unicode regular expressions that behavior is
 109 undesirable: while we may want to process the data in UTF-32 "chunks", the
 110 actual data is much more likely to be encoded as either UTF-8 or UTF-16.
 111 Therefore the header `<boost/regex/icu.hpp>` provides a series of thin wrappers
 112 around these algorithms, called `u32regex_match`, `u32regex_search`, and
 113 `u32regex_replace`.  These wrappers use iterator-adapters internally to
 114 make external UTF-8 or UTF-16 data look as though it's really a UTF-32 sequence,
 115 that can then be passed on to the "real" algorithm.
 116
 117 [h4 u32regex_match]
 118
 119 For each [regex_match] algorithm defined by `<boost/regex.hpp>`,
 120 `<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the
 121 same arguments, but which is called `u32regex_match`, and which will
 122 accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an
 123 ICU UnicodeString as input.
 124
 125 Example: match a password, encoded in a UTF-16 UnicodeString:
 126
 127    //
 128    // Find out if *password* meets our password requirements,
 129    // as defined by the regular expression *requirements*.
 130    //
 131    bool is_valid_password(const UnicodeString& password, const UnicodeString& requirements)
 132    {
 133       return boost::u32regex_match(password, boost::make_u32regex(requirements));
 134    }
 135
 136 Example: match a UTF-8 encoded filename:
 137
 138    //
 139    // Extract filename part of a path from a UTF-8 encoded std::string and return the result
 140    // as another std::string:
 141    //
 142    std::string get_filename(const std::string& path)
 143    {
 144       boost::u32regex r = boost::make_u32regex("(?:\\A|.*\\\\)([^\\\\]+)");
 145       boost::smatch what;
 146       if(boost::u32regex_match(path, what, r))
 147       {
 148          // extract $1 as a std::string:
 149          return what.str(1);
 150       }
 151       else
 152       {
 153          throw std::runtime_error("Invalid pathname");
 154       }
 155    }
 156
 157 [h4 u32regex_search]
 158
 159 For each [regex_search] algorithm defined by `<boost/regex.hpp>`,
 160 `<boost/regex/icu.hpp>` defines an overloaded algorithm that takes the
 161 same arguments, but which is called `u32regex_search`, and which will
 162 accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU
 163 UnicodeString as input.
 164
 165 Example: search for a character sequence in a specific language block:
 166
 167    UnicodeString extract_greek(const UnicodeString& text)
 168    {
 169       // searches through some UTF-16 encoded text for a block encoded in Greek,
 170       // this expression is imperfect, but the best we can do for now - searching
 171       // for specific scripts is actually pretty hard to do right.
 172       //
 173       // Here we search for a character sequence that begins with a Greek letter,
 174       // and continues with characters that are either not-letters ( [^[:L*:]] )
 175       // or are characters in the Greek character block ( [\\x{370}-\\x{3FF}] ).
 176       //
 177       boost::u32regex r = boost::make_u32regex(
 178             L"[\\x{370}-\\x{3FF}](?:[^[:L*:]]|[\\x{370}-\\x{3FF}])*");
 179       boost::u16match what;
 180       if(boost::u32regex_search(text, what, r))
 181       {
 182          // extract $0 as a UnicodeString:
 183          return UnicodeString(what[0].first, what.length(0));
 184       }
 185       else
 186       {
 187          throw std::runtime_error("No Greek found!");
 188       }
 189    }
 190
 191 [h4 u32regex_replace]
 192
 193 For each [regex_replace] algorithm defined by `<boost/regex.hpp>`,
 194 `<boost/regex/icu.hpp>` defines an overloaded algorithm that takes
 195 the same arguments, but which is called `u32regex_replace`, and which will
 196 accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an ICU
 197 UnicodeString as input.  The input sequence and the format string specifier
 198 passed to the algorithm, can be encoded independently (for example one can
 199 be UTF-8, the other in UTF-16), but the result string / output iterator
 200 argument must use the same character encoding as the text being searched.
 201
 202 Example: Credit card number reformatting:
 203
 204    //
 205    // Take a credit card number as a string of digits,
 206    // and reformat it as a human readable string with "-"
 207    // separating each group of four digits;
 208    // note that we're mixing a UTF-32 regex, with a UTF-16
 209    // string and a UTF-8 format specifier, and it still all
 210    // just works:
 211    //
 212    const boost::u32regex e = boost::make_u32regex(
 213          "\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z");
 214    const char* human_format = "$1-$2-$3-$4";
 215
 216    UnicodeString human_readable_card_number(const UnicodeString& s)
 217    {
 218       return boost::u32regex_replace(s, e, human_format);
 219    }
 220
 221 [endsect]
 222 [section:unicode_iter Unicode Aware Regex Iterators]
 223
 224 [h4 u32regex_iterator]
 225
 226 Type `u32regex_iterator` is in all respects the same as [regex_iterator]
 227 except that since the regular expression type is always `u32regex`
 228 it only takes one template parameter (the iterator type). It also calls
 229 `u32regex_search` internally, allowing it to interface correctly with
 230 UTF-8, UTF-16, and UTF-32 data:
 231
 232    template <class BidirectionalIterator>
 233    class u32regex_iterator
 234    {
 235       // for members see regex_iterator
 236    };
 237
 238    typedef u32regex_iterator<const char*>     utf8regex_iterator;
 239    typedef u32regex_iterator<const UChar*>    utf16regex_iterator;
 240    typedef u32regex_iterator<const UChar32*>  utf32regex_iterator;
 241
 242 In order to simplify the construction of a `u32regex_iterator` from a string,
 243 there are a series of non-member helper functions called `make_u32regex_iterator`:
 244
 245    u32regex_iterator<const char*>
 246       make_u32regex_iterator(const char* s,
 247                              const u32regex& e,
 248                              regex_constants::match_flag_type m = regex_constants::match_default);
 249
 250    u32regex_iterator<const wchar_t*>
 251       make_u32regex_iterator(const wchar_t* s,
 252                              const u32regex& e,
 253                              regex_constants::match_flag_type m = regex_constants::match_default);
 254
 255    u32regex_iterator<const UChar*>
 256       make_u32regex_iterator(const UChar* s,
 257                              const u32regex& e,
 258                              regex_constants::match_flag_type m = regex_constants::match_default);
 259
 260    template <class charT, class Traits, class Alloc>
 261    u32regex_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
 262       make_u32regex_iterator(const std::basic_string<charT, Traits, Alloc>& s,
 263                              const u32regex& e,
 264                              regex_constants::match_flag_type m = regex_constants::match_default);
 265
 266    u32regex_iterator<const UChar*>
 267       make_u32regex_iterator(const UnicodeString& s,
 268                              const u32regex& e,
 269                              regex_constants::match_flag_type m = regex_constants::match_default);
 270
 271 Each of these overloads returns an iterator that enumerates all occurrences
 272 of expression /e/, in text /s/, using match_flags /m/.
 273
 274 Example: search for international currency symbols, along with their associated numeric value:
 275
 276    void enumerate_currencies(const std::string& text)
 277    {
 278       // enumerate and print all the currency symbols, along
 279       // with any associated numeric values:
 280       const char* re =
 281          "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
 282          "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
 283          "(?(1)"
 284             "|(?(2)"
 285                "[[:Cf:][:Cc:][:Z*:]]*"
 286             ")"
 287             "[[:Sc:]]"
 288          ")";
 289       boost::u32regex r = boost::make_u32regex(re);
 290       boost::u32regex_iterator<std::string::const_iterator>
 291             i(boost::make_u32regex_iterator(text, r)), j;
 292       while(i != j)
 293       {
 294          std::cout << (*i)[0] << std::endl;
 295          ++i;
 296       }
 297    }
 298
 299 Calling
 300
 301 [/this doesn't format correctly as code:]
 302 [pre enumerate_currencies(" $100.23 or '''&#xA3;'''198.12 ");]
 303
 304 Yields the output:
 305
 306 [pre
 307 $100.23
 308 '''&#xA3;'''198.12
 309 ]
 310
 311 Provided of course that the input is encoded as UTF-8.
 312
 313 [h4 u32regex_token_iterator]
 314
 315 Type `u32regex_token_iterator` is in all respects the same as [regex_token_iterator]
 316 except that since the regular expression type is always `u32regex` it only
 317 takes one template parameter (the iterator type).  It also calls
 318 `u32regex_search` internally, allowing it to interface correctly with UTF-8,
 319 UTF-16, and UTF-32 data:
 320
 321    template <class BidirectionalIterator>
 322    class u32regex_token_iterator
 323    {
 324       // for members see regex_token_iterator
 325    };
 326
 327    typedef u32regex_token_iterator<const char*>     utf8regex_token_iterator;
 328    typedef u32regex_token_iterator<const UChar*>    utf16regex_token_iterator;
 329    typedef u32regex_token_iterator<const UChar32*>  utf32regex_token_iterator;
 330
 331 In order to simplify the construction of a `u32regex_token_iterator` from a string,
 332 there are a series of non-member helper functions called `make_u32regex_token_iterator`:
 333
 334    u32regex_token_iterator<const char*>
 335       make_u32regex_token_iterator(
 336             const char* s,
 337             const u32regex& e,
 338             int sub,
 339             regex_constants::match_flag_type m = regex_constants::match_default);
 340
 341    u32regex_token_iterator<const wchar_t*>
 342       make_u32regex_token_iterator(
 343             const wchar_t* s,
 344             const u32regex& e,
 345             int sub,
 346             regex_constants::match_flag_type m = regex_constants::match_default);
 347
 348    u32regex_token_iterator<const UChar*>
 349       make_u32regex_token_iterator(
 350             const UChar* s,
 351             const u32regex& e,
 352             int sub,
 353             regex_constants::match_flag_type m = regex_constants::match_default);
 354
 355    template <class charT, class Traits, class Alloc>
 356    u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
 357       make_u32regex_token_iterator(
 358             const std::basic_string<charT, Traits, Alloc>& s,
 359             const u32regex& e,
 360             int sub,
 361             regex_constants::match_flag_type m = regex_constants::match_default);
 362
 363    u32regex_token_iterator<const UChar*>
 364       make_u32regex_token_iterator(
 365             const UnicodeString& s,
 366             const u32regex& e,
 367             int sub,
 368             regex_constants::match_flag_type m = regex_constants::match_default);
 369
 370 Each of these overloads returns an iterator that enumerates all occurrences of
 371 marked sub-expression /sub/ in regular expression /e/, found in text /s/, using
 372 match_flags /m/.
 373
 374    template <std::size_t N>
 375    u32regex_token_iterator<const char*>
 376       make_u32regex_token_iterator(
 377             const char* p,
 378             const u32regex& e,
 379             const int (&submatch)[N],
 380             regex_constants::match_flag_type m = regex_constants::match_default);
 381
 382    template <std::size_t N>
 383    u32regex_token_iterator<const wchar_t*>
 384       make_u32regex_token_iterator(
 385             const wchar_t* p,
 386             const u32regex& e,
 387             const int (&submatch)[N],
 388             regex_constants::match_flag_type m = regex_constants::match_default);
 389
 390    template <std::size_t N>
 391    u32regex_token_iterator<const UChar*>
 392       make_u32regex_token_iterator(
 393             const UChar* p,
 394             const u32regex& e,
 395             const int (&submatch)[N],
 396             regex_constants::match_flag_type m = regex_constants::match_default);
 397
 398    template <class charT, class Traits, class Alloc, std::size_t N>
 399    u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
 400       make_u32regex_token_iterator(
 401             const std::basic_string<charT, Traits, Alloc>& p,
 402             const u32regex& e,
 403             const int (&submatch)[N],
 404             regex_constants::match_flag_type m = regex_constants::match_default);
 405
 406    template <std::size_t N>
 407    u32regex_token_iterator<const UChar*>
 408       make_u32regex_token_iterator(
 409             const UnicodeString& s,
 410             const u32regex& e,
 411             const int (&submatch)[N],
 412             regex_constants::match_flag_type m = regex_constants::match_default);
 413
 414 Each of these overloads returns an iterator that enumerates one sub-expression
 415 for each submatch in regular expression /e/, found in text /s/, using match_flags /m/.
 416
 417    u32regex_token_iterator<const char*>
 418       make_u32regex_token_iterator(
 419             const char* p,
 420             const u32regex& e,
 421             const std::vector<int>& submatch,
 422             regex_constants::match_flag_type m = regex_constants::match_default);
 423
 424    u32regex_token_iterator<const wchar_t*>
 425       make_u32regex_token_iterator(
 426             const wchar_t* p,
 427             const u32regex& e,
 428             const std::vector<int>& submatch,
 429             regex_constants::match_flag_type m = regex_constants::match_default);
 430
 431    u32regex_token_iterator<const UChar*>
 432       make_u32regex_token_iterator(
 433             const UChar* p,
 434             const u32regex& e,
 435             const std::vector<int>& submatch,
 436             regex_constants::match_flag_type m = regex_constants::match_default);
 437
 438    template <class charT, class Traits, class Alloc>
 439    u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>
 440       make_u32regex_token_iterator(
 441             const std::basic_string<charT, Traits, Alloc>& p,
 442             const u32regex& e,
 443             const std::vector<int>& submatch,
 444             regex_constants::match_flag_type m = regex_constants::match_default);
 445
 446    u32regex_token_iterator<const UChar*>
 447       make_u32regex_token_iterator(
 448             const UnicodeString& s,
 449             const u32regex& e,
 450             const std::vector<int>& submatch,
 451             regex_constants::match_flag_type m = regex_constants::match_default);
 452
 453 Each of these overloads returns an iterator that enumerates one sub-expression for
 454 each submatch in regular expression /e/, found in text /s/, using match_flags /m/.
 455
 456 Example: search for international currency symbols, along with their associated numeric value:
 457
 458    void enumerate_currencies2(const std::string& text)
 459    {
 460       // enumerate and print all the currency symbols, along
 461       // with any associated numeric values:
 462       const char* re =
 463          "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
 464          "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
 465          "(?(1)"
 466             "|(?(2)"
 467                "[[:Cf:][:Cc:][:Z*:]]*"
 468             ")"
 469             "[[:Sc:]]"
 470          ")";
 471       boost::u32regex r = boost::make_u32regex(re);
 472       boost::u32regex_token_iterator<std::string::const_iterator>
 473          i(boost::make_u32regex_token_iterator(text, r, 1)), j;
 474       while(i != j)
 475       {
 476          std::cout << *i << std::endl;
 477          ++i;
 478       }
 479    }
 480
 481 [endsect]
 482
 483 [endsect]
 484