]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /////////////////////////////////////////////////////////////////////////////// |
2 | /// \file regex_compiler.hpp | |
3 | /// Contains the definition of regex_compiler, a factory for building regex objects | |
4 | /// from strings. | |
5 | // | |
6 | // Copyright 2008 Eric Niebler. Distributed under the Boost | |
7 | // Software License, Version 1.0. (See accompanying file | |
8 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
9 | ||
10 | #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 | |
11 | #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 | |
12 | ||
13 | // MS compatible compilers support #pragma once | |
14 | #if defined(_MSC_VER) | |
15 | # pragma once | |
16 | #endif | |
17 | ||
18 | #include <map> | |
19 | #include <boost/config.hpp> | |
20 | #include <boost/assert.hpp> | |
21 | #include <boost/next_prior.hpp> | |
22 | #include <boost/range/begin.hpp> | |
23 | #include <boost/range/end.hpp> | |
24 | #include <boost/mpl/assert.hpp> | |
25 | #include <boost/throw_exception.hpp> | |
26 | #include <boost/type_traits/is_same.hpp> | |
27 | #include <boost/type_traits/is_pointer.hpp> | |
28 | #include <boost/utility/enable_if.hpp> | |
29 | #include <boost/iterator/iterator_traits.hpp> | |
30 | #include <boost/xpressive/basic_regex.hpp> | |
31 | #include <boost/xpressive/detail/dynamic/parser.hpp> | |
32 | #include <boost/xpressive/detail/dynamic/parse_charset.hpp> | |
33 | #include <boost/xpressive/detail/dynamic/parser_enum.hpp> | |
34 | #include <boost/xpressive/detail/dynamic/parser_traits.hpp> | |
35 | #include <boost/xpressive/detail/core/linker.hpp> | |
36 | #include <boost/xpressive/detail/core/optimize.hpp> | |
37 | ||
38 | namespace boost { namespace xpressive | |
39 | { | |
40 | ||
41 | /////////////////////////////////////////////////////////////////////////////// | |
42 | // regex_compiler | |
43 | // | |
44 | /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string. | |
45 | /// | |
46 | /// Class template regex_compiler is used to construct a basic_regex object from a string. The string | |
47 | /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale, | |
48 | /// after which all basic_regex objects created with that regex_compiler object will use that locale. | |
49 | /// After creating a regex_compiler object, and optionally imbuing it with a locale, you can call the | |
50 | /// compile() method to construct a basic_regex object, passing it the string representing the regular | |
51 | /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex | |
52 | /// objects compiled from the same string will have different regex_id's. | |
53 | template<typename BidiIter, typename RegexTraits, typename CompilerTraits> | |
54 | struct regex_compiler | |
55 | { | |
56 | typedef BidiIter iterator_type; | |
57 | typedef typename iterator_value<BidiIter>::type char_type; | |
58 | typedef regex_constants::syntax_option_type flag_type; | |
59 | typedef RegexTraits traits_type; | |
60 | typedef typename traits_type::string_type string_type; | |
61 | typedef typename traits_type::locale_type locale_type; | |
62 | typedef typename traits_type::char_class_type char_class_type; | |
63 | ||
64 | explicit regex_compiler(RegexTraits const &traits = RegexTraits()) | |
65 | : mark_count_(0) | |
66 | , hidden_mark_count_(0) | |
67 | , traits_(traits) | |
68 | , upper_(0) | |
69 | , self_() | |
70 | , rules_() | |
71 | { | |
72 | this->upper_ = lookup_classname(this->rxtraits(), "upper"); | |
73 | } | |
74 | ||
75 | /////////////////////////////////////////////////////////////////////////// | |
76 | // imbue | |
77 | /// Specify the locale to be used by a regex_compiler. | |
78 | /// | |
79 | /// \param loc The locale that this regex_compiler should use. | |
80 | /// \return The previous locale. | |
81 | locale_type imbue(locale_type loc) | |
82 | { | |
83 | locale_type oldloc = this->traits_.imbue(loc); | |
84 | this->upper_ = lookup_classname(this->rxtraits(), "upper"); | |
85 | return oldloc; | |
86 | } | |
87 | ||
88 | /////////////////////////////////////////////////////////////////////////// | |
89 | // getloc | |
90 | /// Get the locale used by a regex_compiler. | |
91 | /// | |
92 | /// \return The locale used by this regex_compiler. | |
93 | locale_type getloc() const | |
94 | { | |
95 | return this->traits_.getloc(); | |
96 | } | |
97 | ||
98 | /////////////////////////////////////////////////////////////////////////// | |
99 | // compile | |
100 | /// Builds a basic_regex object from a range of characters. | |
101 | /// | |
102 | /// \param begin The beginning of a range of characters representing the | |
103 | /// regular expression to compile. | |
104 | /// \param end The end of a range of characters representing the | |
105 | /// regular expression to compile. | |
106 | /// \param flags Optional bitmask that determines how the pat string is | |
107 | /// interpreted. (See syntax_option_type.) | |
108 | /// \return A basic_regex object corresponding to the regular expression | |
109 | /// represented by the character range. | |
110 | /// \pre InputIter is a model of the InputIterator concept. | |
111 | /// \pre [begin,end) is a valid range. | |
112 | /// \pre The range of characters specified by [begin,end) contains a | |
113 | /// valid string-based representation of a regular expression. | |
114 | /// \throw regex_error when the range of characters has invalid regular | |
115 | /// expression syntax. | |
116 | template<typename InputIter> | |
117 | basic_regex<BidiIter> | |
118 | compile(InputIter begin, InputIter end, flag_type flags = regex_constants::ECMAScript) | |
119 | { | |
120 | typedef typename iterator_category<InputIter>::type category; | |
121 | return this->compile_(begin, end, flags, category()); | |
122 | } | |
123 | ||
124 | /// \overload | |
125 | /// | |
126 | template<typename InputRange> | |
127 | typename disable_if<is_pointer<InputRange>, basic_regex<BidiIter> >::type | |
128 | compile(InputRange const &pat, flag_type flags = regex_constants::ECMAScript) | |
129 | { | |
130 | return this->compile(boost::begin(pat), boost::end(pat), flags); | |
131 | } | |
132 | ||
133 | /// \overload | |
134 | /// | |
135 | basic_regex<BidiIter> | |
136 | compile(char_type const *begin, flag_type flags = regex_constants::ECMAScript) | |
137 | { | |
138 | BOOST_ASSERT(0 != begin); | |
139 | char_type const *end = begin + std::char_traits<char_type>::length(begin); | |
140 | return this->compile(begin, end, flags); | |
141 | } | |
142 | ||
143 | /// \overload | |
144 | /// | |
145 | basic_regex<BidiIter> compile(char_type const *begin, std::size_t size, flag_type flags) | |
146 | { | |
147 | BOOST_ASSERT(0 != begin); | |
148 | char_type const *end = begin + size; | |
149 | return this->compile(begin, end, flags); | |
150 | } | |
151 | ||
152 | /////////////////////////////////////////////////////////////////////////// | |
153 | // operator[] | |
154 | /// Return a reference to the named regular expression. If no such named | |
155 | /// regular expression exists, create a new regular expression and return | |
156 | /// a reference to it. | |
157 | /// | |
158 | /// \param name A std::string containing the name of the regular expression. | |
159 | /// \pre The string is not empty. | |
160 | /// \throw bad_alloc on allocation failure. | |
161 | basic_regex<BidiIter> &operator [](string_type const &name) | |
162 | { | |
163 | BOOST_ASSERT(!name.empty()); | |
164 | return this->rules_[name]; | |
165 | } | |
166 | ||
167 | /// \overload | |
168 | /// | |
169 | basic_regex<BidiIter> const &operator [](string_type const &name) const | |
170 | { | |
171 | BOOST_ASSERT(!name.empty()); | |
172 | return this->rules_[name]; | |
173 | } | |
174 | ||
175 | private: | |
176 | ||
177 | typedef detail::escape_value<char_type, char_class_type> escape_value; | |
178 | typedef detail::alternate_matcher<detail::alternates_vector<BidiIter>, RegexTraits> alternate_matcher; | |
179 | ||
180 | /////////////////////////////////////////////////////////////////////////// | |
181 | // compile_ | |
182 | /// INTERNAL ONLY | |
183 | template<typename FwdIter> | |
184 | basic_regex<BidiIter> compile_(FwdIter begin, FwdIter end, flag_type flags, std::forward_iterator_tag) | |
185 | { | |
186 | BOOST_MPL_ASSERT((is_same<char_type, typename iterator_value<FwdIter>::type>)); | |
187 | using namespace regex_constants; | |
188 | this->reset(); | |
189 | this->traits_.flags(flags); | |
190 | ||
191 | basic_regex<BidiIter> rextmp, *prex = &rextmp; | |
192 | FwdIter tmp = begin; | |
193 | ||
194 | // Check if this regex is a named rule: | |
195 | string_type name; | |
196 | if(token_group_begin == this->traits_.get_token(tmp, end) && | |
197 | BOOST_XPR_ENSURE_(tmp != end, error_paren, "mismatched parenthesis") && | |
198 | token_rule_assign == this->traits_.get_group_type(tmp, end, name)) | |
199 | { | |
200 | begin = tmp; | |
201 | BOOST_XPR_ENSURE_ | |
202 | ( | |
203 | begin != end && token_group_end == this->traits_.get_token(begin, end) | |
204 | , error_paren | |
205 | , "mismatched parenthesis" | |
206 | ); | |
207 | prex = &this->rules_[name]; | |
208 | } | |
209 | ||
210 | this->self_ = detail::core_access<BidiIter>::get_regex_impl(*prex); | |
211 | ||
212 | // at the top level, a regex is a sequence of alternates | |
213 | detail::sequence<BidiIter> seq = this->parse_alternates(begin, end); | |
214 | BOOST_XPR_ENSURE_(begin == end, error_paren, "mismatched parenthesis"); | |
215 | ||
216 | // terminate the sequence | |
217 | seq += detail::make_dynamic<BidiIter>(detail::end_matcher()); | |
218 | ||
219 | // bundle the regex information into a regex_impl object | |
220 | detail::common_compile(seq.xpr().matchable(), *this->self_, this->rxtraits()); | |
221 | ||
222 | this->self_->traits_ = new detail::traits_holder<RegexTraits>(this->rxtraits()); | |
223 | this->self_->mark_count_ = this->mark_count_; | |
224 | this->self_->hidden_mark_count_ = this->hidden_mark_count_; | |
225 | ||
226 | // References changed, update dependencies. | |
227 | this->self_->tracking_update(); | |
228 | this->self_.reset(); | |
229 | return *prex; | |
230 | } | |
231 | ||
232 | /////////////////////////////////////////////////////////////////////////// | |
233 | // compile_ | |
234 | /// INTERNAL ONLY | |
235 | template<typename InputIter> | |
236 | basic_regex<BidiIter> compile_(InputIter begin, InputIter end, flag_type flags, std::input_iterator_tag) | |
237 | { | |
238 | string_type pat(begin, end); | |
239 | return this->compile_(boost::begin(pat), boost::end(pat), flags, std::forward_iterator_tag()); | |
240 | } | |
241 | ||
242 | /////////////////////////////////////////////////////////////////////////// | |
243 | // reset | |
244 | /// INTERNAL ONLY | |
245 | void reset() | |
246 | { | |
247 | this->mark_count_ = 0; | |
248 | this->hidden_mark_count_ = 0; | |
249 | this->traits_.flags(regex_constants::ECMAScript); | |
250 | } | |
251 | ||
252 | /////////////////////////////////////////////////////////////////////////// | |
253 | // rxtraits | |
254 | /// INTERNAL ONLY | |
255 | traits_type &rxtraits() | |
256 | { | |
257 | return this->traits_.traits(); | |
258 | } | |
259 | ||
260 | /////////////////////////////////////////////////////////////////////////// | |
261 | // rxtraits (const overload) | |
262 | /// INTERNAL ONLY | |
263 | traits_type const &rxtraits() const | |
264 | { | |
265 | return this->traits_.traits(); | |
266 | } | |
267 | ||
268 | /////////////////////////////////////////////////////////////////////////// | |
269 | // parse_alternates | |
270 | /// INTERNAL ONLY | |
271 | template<typename FwdIter> | |
272 | detail::sequence<BidiIter> parse_alternates(FwdIter &begin, FwdIter end) | |
273 | { | |
274 | using namespace regex_constants; | |
275 | int count = 0; | |
276 | FwdIter tmp = begin; | |
277 | detail::sequence<BidiIter> seq; | |
278 | ||
279 | do switch(++count) | |
280 | { | |
281 | case 1: | |
282 | seq = this->parse_sequence(tmp, end); | |
283 | break; | |
284 | case 2: | |
285 | seq = detail::make_dynamic<BidiIter>(alternate_matcher()) | seq; | |
286 | BOOST_FALLTHROUGH; | |
287 | default: | |
288 | seq |= this->parse_sequence(tmp, end); | |
289 | } | |
290 | while((begin = tmp) != end && token_alternate == this->traits_.get_token(tmp, end)); | |
291 | ||
292 | return seq; | |
293 | } | |
294 | ||
295 | /////////////////////////////////////////////////////////////////////////// | |
296 | // parse_group | |
297 | /// INTERNAL ONLY | |
298 | template<typename FwdIter> | |
299 | detail::sequence<BidiIter> parse_group(FwdIter &begin, FwdIter end) | |
300 | { | |
301 | using namespace regex_constants; | |
302 | int mark_nbr = 0; | |
303 | bool keeper = false; | |
304 | bool lookahead = false; | |
305 | bool lookbehind = false; | |
306 | bool negative = false; | |
307 | string_type name; | |
308 | ||
309 | detail::sequence<BidiIter> seq, seq_end; | |
310 | FwdIter tmp = FwdIter(); | |
311 | ||
312 | syntax_option_type old_flags = this->traits_.flags(); | |
313 | ||
314 | switch(this->traits_.get_group_type(begin, end, name)) | |
315 | { | |
316 | case token_no_mark: | |
317 | // Don't process empty groups like (?:) or (?i) | |
318 | // BUGBUG this doesn't handle the degenerate (?:)+ correctly | |
319 | if(token_group_end == this->traits_.get_token(tmp = begin, end)) | |
320 | { | |
321 | return this->parse_atom(begin = tmp, end); | |
322 | } | |
323 | break; | |
324 | ||
325 | case token_negative_lookahead: | |
326 | negative = true; | |
327 | BOOST_FALLTHROUGH; | |
328 | case token_positive_lookahead: | |
329 | lookahead = true; | |
330 | break; | |
331 | ||
332 | case token_negative_lookbehind: | |
333 | negative = true; | |
334 | BOOST_FALLTHROUGH; | |
335 | case token_positive_lookbehind: | |
336 | lookbehind = true; | |
337 | break; | |
338 | ||
339 | case token_independent_sub_expression: | |
340 | keeper = true; | |
341 | break; | |
342 | ||
343 | case token_comment: | |
344 | while(BOOST_XPR_ENSURE_(begin != end, error_paren, "mismatched parenthesis")) | |
345 | { | |
346 | switch(this->traits_.get_token(begin, end)) | |
347 | { | |
348 | case token_group_end: | |
349 | return this->parse_atom(begin, end); | |
350 | case token_escape: | |
351 | BOOST_XPR_ENSURE_(begin != end, error_escape, "incomplete escape sequence"); | |
352 | BOOST_FALLTHROUGH; | |
353 | case token_literal: | |
354 | ++begin; | |
355 | break; | |
356 | default: | |
357 | break; | |
358 | } | |
359 | } | |
360 | break; | |
361 | ||
362 | case token_recurse: | |
363 | BOOST_XPR_ENSURE_ | |
364 | ( | |
365 | begin != end && token_group_end == this->traits_.get_token(begin, end) | |
366 | , error_paren | |
367 | , "mismatched parenthesis" | |
368 | ); | |
369 | return detail::make_dynamic<BidiIter>(detail::regex_byref_matcher<BidiIter>(this->self_)); | |
370 | ||
371 | case token_rule_assign: | |
372 | BOOST_THROW_EXCEPTION( | |
373 | regex_error(error_badrule, "rule assignments must be at the front of the regex") | |
374 | ); | |
375 | break; | |
376 | ||
377 | case token_rule_ref: | |
378 | { | |
379 | typedef detail::core_access<BidiIter> access; | |
380 | BOOST_XPR_ENSURE_ | |
381 | ( | |
382 | begin != end && token_group_end == this->traits_.get_token(begin, end) | |
383 | , error_paren | |
384 | , "mismatched parenthesis" | |
385 | ); | |
386 | basic_regex<BidiIter> &rex = this->rules_[name]; | |
387 | shared_ptr<detail::regex_impl<BidiIter> > impl = access::get_regex_impl(rex); | |
388 | this->self_->track_reference(*impl); | |
389 | return detail::make_dynamic<BidiIter>(detail::regex_byref_matcher<BidiIter>(impl)); | |
390 | } | |
391 | ||
392 | case token_named_mark: | |
393 | mark_nbr = static_cast<int>(++this->mark_count_); | |
394 | for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i) | |
395 | { | |
396 | BOOST_XPR_ENSURE_(this->self_->named_marks_[i].name_ != name, error_badmark, "named mark already exists"); | |
397 | } | |
398 | this->self_->named_marks_.push_back(detail::named_mark<char_type>(name, this->mark_count_)); | |
399 | seq = detail::make_dynamic<BidiIter>(detail::mark_begin_matcher(mark_nbr)); | |
400 | seq_end = detail::make_dynamic<BidiIter>(detail::mark_end_matcher(mark_nbr)); | |
401 | break; | |
402 | ||
403 | case token_named_mark_ref: | |
404 | BOOST_XPR_ENSURE_ | |
405 | ( | |
406 | begin != end && token_group_end == this->traits_.get_token(begin, end) | |
407 | , error_paren | |
408 | , "mismatched parenthesis" | |
409 | ); | |
410 | for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i) | |
411 | { | |
412 | if(this->self_->named_marks_[i].name_ == name) | |
413 | { | |
414 | mark_nbr = static_cast<int>(this->self_->named_marks_[i].mark_nbr_); | |
415 | return detail::make_backref_xpression<BidiIter> | |
416 | ( | |
417 | mark_nbr, this->traits_.flags(), this->rxtraits() | |
418 | ); | |
419 | } | |
420 | } | |
421 | BOOST_THROW_EXCEPTION(regex_error(error_badmark, "invalid named back-reference")); | |
422 | break; | |
423 | ||
424 | default: | |
425 | mark_nbr = static_cast<int>(++this->mark_count_); | |
426 | seq = detail::make_dynamic<BidiIter>(detail::mark_begin_matcher(mark_nbr)); | |
427 | seq_end = detail::make_dynamic<BidiIter>(detail::mark_end_matcher(mark_nbr)); | |
428 | break; | |
429 | } | |
430 | ||
431 | // alternates | |
432 | seq += this->parse_alternates(begin, end); | |
433 | seq += seq_end; | |
434 | BOOST_XPR_ENSURE_ | |
435 | ( | |
436 | begin != end && token_group_end == this->traits_.get_token(begin, end) | |
437 | , error_paren | |
438 | , "mismatched parenthesis" | |
439 | ); | |
440 | ||
441 | typedef detail::shared_matchable<BidiIter> xpr_type; | |
442 | if(lookahead) | |
443 | { | |
444 | seq += detail::make_independent_end_xpression<BidiIter>(seq.pure()); | |
445 | detail::lookahead_matcher<xpr_type> lam(seq.xpr(), negative, seq.pure()); | |
446 | seq = detail::make_dynamic<BidiIter>(lam); | |
447 | } | |
448 | else if(lookbehind) | |
449 | { | |
450 | seq += detail::make_independent_end_xpression<BidiIter>(seq.pure()); | |
451 | detail::lookbehind_matcher<xpr_type> lbm(seq.xpr(), seq.width().value(), negative, seq.pure()); | |
452 | seq = detail::make_dynamic<BidiIter>(lbm); | |
453 | } | |
454 | else if(keeper) // independent sub-expression | |
455 | { | |
456 | seq += detail::make_independent_end_xpression<BidiIter>(seq.pure()); | |
457 | detail::keeper_matcher<xpr_type> km(seq.xpr(), seq.pure()); | |
458 | seq = detail::make_dynamic<BidiIter>(km); | |
459 | } | |
460 | ||
461 | // restore the modifiers | |
462 | this->traits_.flags(old_flags); | |
463 | return seq; | |
464 | } | |
465 | ||
466 | /////////////////////////////////////////////////////////////////////////// | |
467 | // parse_charset | |
468 | /// INTERNAL ONLY | |
469 | template<typename FwdIter> | |
470 | detail::sequence<BidiIter> parse_charset(FwdIter &begin, FwdIter end) | |
471 | { | |
472 | detail::compound_charset<traits_type> chset; | |
473 | ||
474 | // call out to a helper to actually parse the character set | |
475 | detail::parse_charset(begin, end, chset, this->traits_); | |
476 | ||
477 | return detail::make_charset_xpression<BidiIter> | |
478 | ( | |
479 | chset | |
480 | , this->rxtraits() | |
481 | , this->traits_.flags() | |
482 | ); | |
483 | } | |
484 | ||
485 | /////////////////////////////////////////////////////////////////////////// | |
486 | // parse_atom | |
487 | /// INTERNAL ONLY | |
488 | template<typename FwdIter> | |
489 | detail::sequence<BidiIter> parse_atom(FwdIter &begin, FwdIter end) | |
490 | { | |
491 | using namespace regex_constants; | |
492 | escape_value esc = { 0, 0, 0, detail::escape_char }; | |
493 | FwdIter old_begin = begin; | |
494 | ||
495 | switch(this->traits_.get_token(begin, end)) | |
496 | { | |
497 | case token_literal: | |
498 | return detail::make_literal_xpression<BidiIter> | |
499 | ( | |
500 | this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits() | |
501 | ); | |
502 | ||
503 | case token_any: | |
504 | return detail::make_any_xpression<BidiIter>(this->traits_.flags(), this->rxtraits()); | |
505 | ||
506 | case token_assert_begin_sequence: | |
507 | return detail::make_dynamic<BidiIter>(detail::assert_bos_matcher()); | |
508 | ||
509 | case token_assert_end_sequence: | |
510 | return detail::make_dynamic<BidiIter>(detail::assert_eos_matcher()); | |
511 | ||
512 | case token_assert_begin_line: | |
513 | return detail::make_assert_begin_line<BidiIter>(this->traits_.flags(), this->rxtraits()); | |
514 | ||
515 | case token_assert_end_line: | |
516 | return detail::make_assert_end_line<BidiIter>(this->traits_.flags(), this->rxtraits()); | |
517 | ||
518 | case token_assert_word_boundary: | |
519 | return detail::make_assert_word<BidiIter>(detail::word_boundary<mpl::true_>(), this->rxtraits()); | |
520 | ||
521 | case token_assert_not_word_boundary: | |
522 | return detail::make_assert_word<BidiIter>(detail::word_boundary<mpl::false_>(), this->rxtraits()); | |
523 | ||
524 | case token_assert_word_begin: | |
525 | return detail::make_assert_word<BidiIter>(detail::word_begin(), this->rxtraits()); | |
526 | ||
527 | case token_assert_word_end: | |
528 | return detail::make_assert_word<BidiIter>(detail::word_end(), this->rxtraits()); | |
529 | ||
530 | case token_escape: | |
531 | esc = this->parse_escape(begin, end); | |
532 | switch(esc.type_) | |
533 | { | |
534 | case detail::escape_mark: | |
535 | return detail::make_backref_xpression<BidiIter> | |
536 | ( | |
537 | esc.mark_nbr_, this->traits_.flags(), this->rxtraits() | |
538 | ); | |
539 | case detail::escape_char: | |
540 | return detail::make_char_xpression<BidiIter> | |
541 | ( | |
542 | esc.ch_, this->traits_.flags(), this->rxtraits() | |
543 | ); | |
544 | case detail::escape_class: | |
545 | return detail::make_posix_charset_xpression<BidiIter> | |
546 | ( | |
547 | esc.class_ | |
548 | , this->is_upper_(*begin++) | |
549 | , this->traits_.flags() | |
550 | , this->rxtraits() | |
551 | ); | |
552 | } | |
553 | ||
554 | case token_group_begin: | |
555 | return this->parse_group(begin, end); | |
556 | ||
557 | case token_charset_begin: | |
558 | return this->parse_charset(begin, end); | |
559 | ||
560 | case token_invalid_quantifier: | |
561 | BOOST_THROW_EXCEPTION(regex_error(error_badrepeat, "quantifier not expected")); | |
562 | break; | |
563 | ||
564 | case token_quote_meta_begin: | |
565 | return detail::make_literal_xpression<BidiIter> | |
566 | ( | |
567 | this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits() | |
568 | ); | |
569 | ||
570 | case token_quote_meta_end: | |
571 | BOOST_THROW_EXCEPTION( | |
572 | regex_error( | |
573 | error_escape | |
574 | , "found quote-meta end without corresponding quote-meta begin" | |
575 | ) | |
576 | ); | |
577 | break; | |
578 | ||
579 | case token_end_of_pattern: | |
580 | break; | |
581 | ||
582 | default: | |
583 | begin = old_begin; | |
584 | break; | |
585 | } | |
586 | ||
587 | return detail::sequence<BidiIter>(); | |
588 | } | |
589 | ||
590 | /////////////////////////////////////////////////////////////////////////// | |
591 | // parse_quant | |
592 | /// INTERNAL ONLY | |
593 | template<typename FwdIter> | |
594 | detail::sequence<BidiIter> parse_quant(FwdIter &begin, FwdIter end) | |
595 | { | |
596 | BOOST_ASSERT(begin != end); | |
597 | detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ }; | |
598 | detail::sequence<BidiIter> seq = this->parse_atom(begin, end); | |
599 | ||
600 | // BUGBUG this doesn't handle the degenerate (?:)+ correctly | |
601 | if(!seq.empty() && begin != end && detail::quant_none != seq.quant()) | |
602 | { | |
603 | if(this->traits_.get_quant_spec(begin, end, spec)) | |
604 | { | |
605 | BOOST_ASSERT(spec.min_ <= spec.max_); | |
606 | ||
607 | if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing. | |
608 | { | |
609 | seq = this->parse_quant(begin, end); | |
610 | } | |
611 | else | |
612 | { | |
613 | seq.repeat(spec); | |
614 | } | |
615 | } | |
616 | } | |
617 | ||
618 | return seq; | |
619 | } | |
620 | ||
621 | /////////////////////////////////////////////////////////////////////////// | |
622 | // parse_sequence | |
623 | /// INTERNAL ONLY | |
624 | template<typename FwdIter> | |
625 | detail::sequence<BidiIter> parse_sequence(FwdIter &begin, FwdIter end) | |
626 | { | |
627 | detail::sequence<BidiIter> seq; | |
628 | ||
629 | while(begin != end) | |
630 | { | |
631 | detail::sequence<BidiIter> seq_quant = this->parse_quant(begin, end); | |
632 | ||
633 | // did we find a quantified atom? | |
634 | if(seq_quant.empty()) | |
635 | break; | |
636 | ||
637 | // chain it to the end of the xpression sequence | |
638 | seq += seq_quant; | |
639 | } | |
640 | ||
641 | return seq; | |
642 | } | |
643 | ||
644 | /////////////////////////////////////////////////////////////////////////// | |
645 | // parse_literal | |
646 | // scan ahead looking for char literals to be globbed together into a string literal | |
647 | /// INTERNAL ONLY | |
648 | template<typename FwdIter> | |
649 | string_type parse_literal(FwdIter &begin, FwdIter end) | |
650 | { | |
651 | using namespace regex_constants; | |
652 | BOOST_ASSERT(begin != end); | |
653 | BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end)); | |
654 | escape_value esc = { 0, 0, 0, detail::escape_char }; | |
655 | string_type literal(1, *begin); | |
656 | ||
657 | for(FwdIter prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp) | |
658 | { | |
659 | detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ }; | |
660 | if(this->traits_.get_quant_spec(tmp, end, spec)) | |
661 | { | |
662 | if(literal.size() != 1) | |
663 | { | |
664 | begin = prev; | |
665 | literal.erase(boost::prior(literal.end())); | |
666 | } | |
667 | return literal; | |
668 | } | |
669 | else switch(this->traits_.get_token(tmp, end)) | |
670 | { | |
671 | case token_escape: | |
672 | esc = this->parse_escape(tmp, end); | |
673 | if(detail::escape_char != esc.type_) return literal; | |
674 | literal.insert(literal.end(), esc.ch_); | |
675 | break; | |
676 | case token_literal: | |
677 | literal.insert(literal.end(), *tmp++); | |
678 | break; | |
679 | default: | |
680 | return literal; | |
681 | } | |
682 | } | |
683 | ||
684 | return literal; | |
685 | } | |
686 | ||
687 | /////////////////////////////////////////////////////////////////////////// | |
688 | // parse_quote_meta | |
689 | // scan ahead looking for char literals to be globbed together into a string literal | |
690 | /// INTERNAL ONLY | |
691 | template<typename FwdIter> | |
692 | string_type parse_quote_meta(FwdIter &begin, FwdIter end) | |
693 | { | |
694 | using namespace regex_constants; | |
695 | FwdIter old_begin = begin, old_end; | |
696 | while(end != (old_end = begin)) | |
697 | { | |
698 | switch(this->traits_.get_token(begin, end)) | |
699 | { | |
700 | case token_quote_meta_end: | |
701 | return string_type(old_begin, old_end); | |
702 | case token_escape: | |
703 | BOOST_XPR_ENSURE_(begin != end, error_escape, "incomplete escape sequence"); | |
704 | BOOST_FALLTHROUGH; | |
705 | case token_invalid_quantifier: | |
706 | case token_literal: | |
707 | ++begin; | |
708 | break; | |
709 | default: | |
710 | break; | |
711 | } | |
712 | } | |
713 | return string_type(old_begin, begin); | |
714 | } | |
715 | ||
716 | /////////////////////////////////////////////////////////////////////////////// | |
717 | // parse_escape | |
718 | /// INTERNAL ONLY | |
719 | template<typename FwdIter> | |
720 | escape_value parse_escape(FwdIter &begin, FwdIter end) | |
721 | { | |
722 | BOOST_XPR_ENSURE_(begin != end, regex_constants::error_escape, "incomplete escape sequence"); | |
723 | ||
724 | // first, check to see if this can be a backreference | |
725 | if(0 < this->rxtraits().value(*begin, 10)) | |
726 | { | |
727 | // Parse at most 3 decimal digits. | |
728 | FwdIter tmp = begin; | |
729 | int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999); | |
730 | ||
731 | // If the resulting number could conceivably be a backref, then it is. | |
732 | if(10 > mark_nbr || mark_nbr <= static_cast<int>(this->mark_count_)) | |
733 | { | |
734 | begin = tmp; | |
735 | escape_value esc = {0, mark_nbr, 0, detail::escape_mark}; | |
736 | return esc; | |
737 | } | |
738 | } | |
739 | ||
740 | // Not a backreference, defer to the parse_escape helper | |
741 | return detail::parse_escape(begin, end, this->traits_); | |
742 | } | |
743 | ||
744 | bool is_upper_(char_type ch) const | |
745 | { | |
746 | return 0 != this->upper_ && this->rxtraits().isctype(ch, this->upper_); | |
747 | } | |
748 | ||
749 | std::size_t mark_count_; | |
750 | std::size_t hidden_mark_count_; | |
751 | CompilerTraits traits_; | |
752 | typename RegexTraits::char_class_type upper_; | |
753 | shared_ptr<detail::regex_impl<BidiIter> > self_; | |
754 | std::map<string_type, basic_regex<BidiIter> > rules_; | |
755 | }; | |
756 | ||
757 | }} // namespace boost::xpressive | |
758 | ||
759 | #endif |