6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
12 #ifndef BOOST_REGEX_MATCHER_HPP
13 #define BOOST_REGEX_MATCHER_HPP
15 #include <boost/regex/v4/iterator_category.hpp>
19 #pragma warning(disable: 4103)
21 #ifdef BOOST_HAS_ABI_HEADERS
22 # include BOOST_ABI_PREFIX
29 # pragma warning(push)
30 #pragma warning(disable : 4251)
32 # pragma warning(disable : 4231)
34 # if BOOST_MSVC < 1600
35 # pragma warning(disable : 4660)
38 #pragma warning(disable:4800)
43 namespace BOOST_REGEX_DETAIL_NS{
46 // error checking API:
// Declaration only: takes the regex syntax options `ef` and the match flags
// `mf` and returns nothing. The definition is not part of this header.
// NOTE(review): presumably it rejects unsupported option/flag combinations -
// confirm against the implementation file.
48 BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf);
50 // function can_start:
// Generic overload for an arbitrary character type.
//   c    - the character to test
//   map  - lookup table indexed by character value
//   mask - bits to test in the selected table entry
// A value below zero, or at/above (1 << CHAR_BIT), is never looked up in
// `map`; the function answers true for it. Any other value yields
// map[c] & mask converted to bool.
52 template <class charT>
53 inline bool can_start(charT c, const unsigned char* map, unsigned char mask)
55 return ((c < static_cast<charT>(0)) ? true : ((c >= static_cast<charT>(1 << CHAR_BIT)) ? true : map[c] & mask));
// Overload for plain char: the character is converted to unsigned char before
// indexing, so the index into `map` is never negative. No range check is made.
57 inline bool can_start(char c, const unsigned char* map, unsigned char mask)
59 return map[(unsigned char)c] & mask;
// Overload for signed char: same conversion to unsigned char before indexing.
61 inline bool can_start(signed char c, const unsigned char* map, unsigned char mask)
63 return map[(unsigned char)c] & mask;
// Overload for unsigned char (its body is not visible in this listing).
65 inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask)
// Overload for unsigned short: values at/above (1 << CHAR_BIT) answer true
// without a table lookup; smaller values test map[c] against `mask`.
69 inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask)
71 return ((c >= (1 << CHAR_BIT)) ? true : map[c] & mask);
73 #if !defined(__hpux) && !defined(__WINSCW__)// WCHAR_MIN not usable in pp-directives.
// The wchar_t overload is only offered when WCHAR_MIN is defined as 0 (i.e.
// wchar_t has no negative values) and wchar_t is an intrinsic type.
74 #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T)
75 inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask)
// Values at/above (1u << CHAR_BIT) answer true without a table lookup.
77 return ((c >= static_cast<wchar_t>(1u << CHAR_BIT)) ? true : map[c] & mask);
// NOTE(review): the directives closing the conditionals above are not visible
// in this listing, so the exact nesting of the next guard is unconfirmed.
81 #if !defined(BOOST_NO_INTRINSIC_WCHAR_T)
82 inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask)
// Same rule for unsigned int: large values answer true, others test the table.
84 return (((c >= static_cast<unsigned int>(1u << CHAR_BIT)) ? true : map[c] & mask));
90 // Unfortunately Rogue Wave's standard library appears to have a bug
91 // in std::basic_string::compare that results in erroneous answers
92 // in some cases (tested with Borland C++ 5.1, Rogue Wave lib version
93 // 0x020101) the test case was:
94 // {39135,0} < {0xff,0}
95 // which succeeds when it should not.
// string_compare for std::basic_string against a C-style string `p`.
// Only the first test of the body is visible here: it singles out the case
// where `s` is empty or consists of exactly one null character. What is
// returned in that case, and how the general case is compared, is not visible
// in this listing.
98 template <class C, class T, class A>
99 inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
103 if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
// A second definition with the identical signature and the identical first
// test. NOTE(review): the two presumably sit in alternative preprocessor
// branches whose directives are missing from this listing - confirm.
109 template <class C, class T, class A>
110 inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
114 if(s.empty() || ((s.size() == 1) && (s[0] == 0)))
// Narrow-string overload: forwards to std::strcmp on s.c_str() and `p`, and
// returns its result unchanged.
119 inline int string_compare(const std::string& s, const char* p)
120 { return std::strcmp(s.c_str(), p); }
// Wide-string overload, compiled only when BOOST_NO_WREGEX is not defined:
// forwards to std::wcscmp on s.c_str() and `p`.
121 # ifndef BOOST_NO_WREGEX
122 inline int string_compare(const std::wstring& s, const wchar_t* p)
123 { return std::wcscmp(s.c_str(), p); }
// Generic overload for any sequence `s` offering size() and operator[],
// compared against the C-style string `p`.
126 template <class Seq, class C>
127 inline int string_compare(const Seq& s, const C* p)
// Scan while still inside `s` and the elements at index i are equal.
// (The loop body is not visible here; presumably it advances i - confirm.)
130 while((i < s.size()) && (p[i] == s[i]))
// If all of `s` was consumed the result is -(int)p[i]: zero when `p` ends at
// the same place, otherwise the negated next element of `p`. If the scan
// stopped early the result is the difference of the first unequal elements.
134 return (i == s.size()) ? -(int)p[i] : (int)s[i] - (int)p[i];
// STR_COMP(s,p) is a thin macro alias that expands to string_compare(s,p).
136 # define STR_COMP(s,p) string_compare(s,p)
// Walks `p` forward until it points at a null character of type charT.
// NOTE(review): the remainder of the body (the value returned) is not visible
// in this listing; the name suggests the position just after the null - confirm.
138 template<class charT>
139 inline const charT* re_skip_past_null(const charT* p)
141 while (*p != static_cast<charT>(0)) ++p;
// Tests the input at `next` against the long character set `set_`.
//   next  - iterator to the character under test
//   set_  - set descriptor; its string data is read from the memory directly
//           after the structure (set_+1)
//   e     - regex data supplying the traits object (m_ptraits) and flags (m_flags)
//   icase - passed to traits translate() for every character examined
// A further parameter (`last`, used below as the end of input) and the locals
// `ptr` and `i` are declared on lines missing from this listing.
// Visible return paths: a hit returns `next` unchanged when set_->isnot is
// set, otherwise an advanced iterator; the final fall-through does the reverse.
145 template <class iterator, class charT, class traits_type, class char_classT>
146 iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
148 const re_set_long<char_classT>* set_,
149 const regex_data<charT, traits_type>& e, bool icase)
// p walks the null-terminated strings stored after the set structure.
151 const charT* p = reinterpret_cast<const charT*>(set_+1);
154 //bool icase = e.m_flags & regex_constants::icase;
// Nothing to test at end of input: hand `next` back untouched.
156 if(next == last) return next;
158 typedef typename traits_type::string_type traits_string_type;
159 const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits);
161 // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never
165 // try and match a single character, could be a multi-character
166 // collating element...
// One iteration per stored single (set_->csingles of them).
167 for(i = 0; i < set_->csingles; ++i)
170 if(*p == static_cast<charT>(0))
172 // treat null string as special case:
173 if(traits_inst.translate(*ptr, icase))
// On a hit: `next` if the set is negated; otherwise one past `next` when ptr
// has not moved, else `ptr` itself.
178 return set_->isnot ? next : (ptr == next) ? ++next : ptr;
// Compare the stored string with the translated input, stopping at the
// string's terminator or at end of input.
182 while(*p && (ptr != last))
184 if(traits_inst.translate(*ptr, icase) != *p)
190 if(*p == static_cast<charT>(0)) // if null we've matched
191 return set_->isnot ? next : (ptr == next) ? ++next : ptr;
193 p = re_skip_past_null(p); // skip null
// From here on only the single translated character at `next` is considered.
197 charT col = traits_inst.translate(*next, icase);
200 if(set_->cranges || set_->cequivalents)
202 traits_string_type s1;
204 // try and match a range, NB only a single character can match
// The branch taken when the collate flag is clear is not visible here; the
// visible code builds s1 via traits transform() of the one-character string.
207 if((e.m_flags & regex_constants::collate) == 0)
211 charT a[2] = { col, charT(0), };
212 s1 = traits_inst.transform(a, a + 1);
// Each range is checked with two STR_COMP tests against stored strings:
// s1 >= the first, then s1 <= the next.
214 for(i = 0; i < set_->cranges; ++i)
216 if(STR_COMP(s1, p) >= 0)
220 if(STR_COMP(s1, p) <= 0)
221 return set_->isnot ? next : ++next;
229 // skip second string
235 // try and match an equivalence class, NB only a single character can match
236 if(set_->cequivalents)
238 charT a[2] = { col, charT(0), };
// Equivalence classes compare the traits transform_primary() key for equality.
239 s1 = traits_inst.transform_primary(a, a +1);
240 for(i = 0; i < set_->cequivalents; ++i)
242 if(STR_COMP(s1, p) == 0)
243 return set_->isnot ? next : ++next;
// Character classes: a hit if col is in set_->cclasses ...
250 if(traits_inst.isctype(col, set_->cclasses) == true)
251 return set_->isnot ? next : ++next;
// ... or if cnclasses is non-zero and col is NOT in set_->cnclasses.
252 if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false))
253 return set_->isnot ? next : ++next;
// Nothing matched: a negated set advances, a normal set returns `next` as is.
254 return set_->isnot ? ++next : next;
// repeater_count: bookkeeping for one repeat, chained to other counters via
// `next`. The class header line and the declaration of `state_id` (used
// throughout below) are missing from this listing.
257 template <class BidiIterator>
// `stack` is supplied by the constructors; `next` links to another counter.
260 repeater_count** stack;
261 repeater_count* next;
263 std::size_t count; // the number of iterations so far
264 BidiIterator start_pos; // where the last repeat started
// Follows the chain starting at `p` looking for the entry whose state_id
// equals `n`. While searching it tests for the id (-2 - current_recursion_id),
// and it calls itself again when it lands on an entry with a negative
// state_id. The statements inside those branches are not visible here.
266 repeater_count* unwind_until(int n, repeater_count* p, int current_recursion_id)
268 while(p && (p->state_id != n))
270 if(-2 - current_recursion_id == p->state_id)
273 if(p && (p->state_id < 0))
275 p = unwind_until(p->state_id, p, current_recursion_id);
// Constructor taking only the stack slot: no link, id -1, zero iterations.
284 repeater_count(repeater_count** s) : stack(s), next(0), state_id(-1), count(0), start_pos() {}
// Constructor for a counter with id `i` starting at `start`. Visible logic:
// compares the new id with next->state_id, otherwise searches the chain with
// unwind_until() and copies start_pos from the entry found. The remaining
// statements are not visible in this listing.
286 repeater_count(int i, repeater_count** s, BidiIterator start, int current_recursion_id)
293 if((state_id > next->state_id) && (next->state_id >= 0))
297 repeater_count* p = next;
298 p = unwind_until(state_id, p, current_recursion_id);
302 start_pos = p->start_pos;
// Accessors: current iteration count and this counter's id.
313 std::size_t get_count() { return count; }
314 int get_id() { return state_id; }
// Pre-increment: bumps the iteration count and returns the new value.
315 std::size_t operator++() { return ++count; }
316 bool check_null_repeat(const BidiIterator& pos, std::size_t max)
318 // this is called when we are about to start a new repeat,
319 // if the last one was NULL move our count to max,
320 // otherwise save the current position.
// The previous repeat counts as null only if at least one iteration has
// happened and `pos` equals the stored start position.
321 bool result = (count == 0) ? false : (pos == start_pos);
// Numeric tags for saved-state kinds. The enumerators for 0 and 4 are not
// visible in this listing; the rest run consecutively and saved_state_count
// (14) is one more than the largest tag shown.
332 enum saved_state_type
335 saved_type_paren = 1,
336 saved_type_recurse = 2,
337 saved_type_assertion = 3,
339 saved_state_repeater_count = 5,
340 saved_state_extra_block = 6,
341 saved_state_greedy_single_repeat = 7,
342 saved_state_rep_slow_dot = 8,
343 saved_state_rep_fast_dot = 9,
344 saved_state_rep_char = 10,
345 saved_state_rep_short_set = 11,
346 saved_state_rep_long_set = 12,
347 saved_state_non_greedy_long_repeat = 13,
348 saved_state_count = 14
352 # pragma warning(push)
353 #if BOOST_MSVC >= 1800
354 #pragma warning(disable:26495)
// recursion_info: one record per recursion; perl_matcher keeps these in its
// recursion_stack vector. `Results` must provide value_type, which in turn
// must provide an iterator type. Some members are missing from this listing.
357 template <class Results>
358 struct recursion_info
360 typedef typename Results::value_type value_type;
361 typedef typename value_type::iterator iterator;
// State-machine node pointer. NOTE(review): by its name, the node to resume
// at when the recursion finishes - confirm against the implementation.
363 const re_syntax_base* preturn_address;
// Pointer to a repeater_count over the same iterator type.
365 repeater_count<iterator>* repeater_stack;
// Input position. NOTE(review): by its name, where the recursion began.
366 iterator location_of_start;
369 # pragma warning(pop)
// perl_matcher: the matcher class, parameterised on the iterator type, the
// allocator used by match_results, and the regex traits. The class header
// line itself is missing from this listing.
372 template <class BidiIterator, class Allocator, class traits>
376 typedef typename traits::char_type char_type;
377 typedef perl_matcher<BidiIterator, Allocator, traits> self_type;
// Pointer-to-member type for the parameterless match_* procedures.
378 typedef bool (self_type::*matcher_proc_type)(void);
379 typedef std::size_t traits_size_type;
380 typedef typename is_byte<char_type>::width_type width_type;
381 typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type;
382 typedef match_results<BidiIterator, Allocator> results_type;
// Constructor. Binds the caller's result object and the expression by
// reference, starts `position` at `first`, and cross-links next_count and
// rep_obj (each is initialised with the other's address). The parameters `f`
// and `l_base` used below are declared on lines missing from this listing.
384 perl_matcher(BidiIterator first, BidiIterator end,
385 match_results<BidiIterator, Allocator>& what,
386 const basic_regex<char_type, traits>& e,
389 : m_result(what), base(first), last(end),
390 position(first), backstop(l_base), re(e), traits_inst(e.get_traits()),
391 m_independent(false), next_count(&rep_obj), rep_obj(&next_count)
392 #ifdef BOOST_REGEX_NON_RECURSIVE
// Remaining set-up is delegated to construct_init().
396 construct_init(e, f);
// setf ORs the given bits into the current match flags.
402 void setf(match_flag_type f)
403 { m_match_flags |= f; }
// unsetf clears the given bits from the current match flags.
404 void unsetf(match_flag_type f)
405 { m_match_flags &= ~f; }
// Member-function declarations; bodies live in the implementation headers
// included at the end of this file unless defined inline here.
// construct_init is called from the constructor with the expression and flags.
408 void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f);
// Only with BOOST_REGEX_HAS_MS_STACK_GUARD: a wrapper that takes a
// parameterless bool-returning member function.
412 #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
413 typedef bool (perl_matcher::*protected_proc_type)();
414 bool protected_call(protected_proc_type);
// Two overloads selected by a tag pointer: one for random-access iterators,
// one catch-all taking void*.
416 void estimate_max_state_count(std::random_access_iterator_tag*);
417 void estimate_max_state_count(void*);
419 bool match_all_states();
421 // match procs, stored in s_match_vtable:
422 bool match_startmark();
423 bool match_endmark();
424 bool match_literal();
425 bool match_start_line();
426 bool match_end_line();
429 bool match_word_boundary();
430 bool match_within_word();
431 bool match_word_start();
432 bool match_word_end();
433 bool match_buffer_start();
434 bool match_buffer_end();
435 bool match_backref();
436 bool match_long_set();
441 bool match_combining();
442 bool match_soft_buffer_end();
443 bool match_restart_continue();
444 bool match_long_set_repeat();
445 bool match_set_repeat();
446 bool match_char_repeat();
447 bool match_dot_repeat_fast();
448 bool match_dot_repeat_slow();
// Inline dispatcher: uses the fast variant when BidiIterator is a
// random-access iterator, and the slow variant otherwise.
449 bool match_dot_repeat_dispatch()
451 return ::boost::is_random_access_iterator<BidiIterator>::value ? match_dot_repeat_fast() : match_dot_repeat_slow();
453 bool match_backstep();
454 bool match_assert_backref();
455 bool match_toggle_case();
// Declared only when BOOST_REGEX_RECURSIVE is defined.
456 #ifdef BOOST_REGEX_RECURSIVE
457 bool backtrack_till_match(std::size_t count);
459 bool match_recursion();
464 bool skip_until_paren(int index, bool match = true);
466 // find procs stored in s_find_vtable:
467 bool find_restart_any();
468 bool find_restart_word();
469 bool find_restart_line();
470 bool find_restart_buf();
471 bool find_restart_lit();
// Data members. Each is described by the comment directly above it. A few
// declarations are missing from this listing, which leaves some comments
// without the member they describe.
474 // final result structure to be filled in:
475 match_results<BidiIterator, Allocator>& m_result;
476 // temporary result for POSIX matches:
477 scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match;
478 // pointer to actual result structure to fill in:
479 match_results<BidiIterator, Allocator>* m_presult;
480 // start of sequence being searched:
482 // end of sequence being searched:
484 // current character being examined:
485 BidiIterator position;
486 // where to restart next search after failed match attempt:
487 BidiIterator restart;
488 // where the current search started from, acts as base for $` during grep:
489 BidiIterator search_base;
490 // how far we can go back when matching lookbehind:
491 BidiIterator backstop;
492 // the expression being examined:
493 const basic_regex<char_type, traits>& re;
494 // the expression's traits class:
495 const ::boost::regex_traits_wrapper<traits>& traits_inst;
496 // the next state in the machine being matched:
497 const re_syntax_base* pstate;
498 // matching flags in use:
499 match_flag_type m_match_flags;
500 // how many states we have examined so far:
501 std::ptrdiff_t state_count;
502 // max number of states to examine before giving up:
503 std::ptrdiff_t max_state_count;
504 // whether we should ignore case or not:
506 // set to true when (position == last), indicates that we may have a partial match:
507 bool m_has_partial_match;
508 // set to true whenever we get a match:
509 bool m_has_found_match;
510 // set to true whenever we're inside an independent sub-expression:
512 // the current repeat being examined:
513 repeater_count<BidiIterator>* next_count;
514 // the first repeat being examined (top of linked list):
515 repeater_count<BidiIterator> rep_obj;
516 // the mask to pass when matching word boundaries:
517 typename traits::char_class_type m_word_mask;
518 // the bitmask to use when determining whether a match_any matches a newline or not:
519 unsigned char match_any_mask;
520 // recursion information:
521 std::vector<recursion_info<results_type> > recursion_stack;
522 #ifdef BOOST_REGEX_RECURSIVE
523 // Set to false by a (*COMMIT):
524 bool m_can_backtrack;
// Everything below is compiled only when BOOST_REGEX_NON_RECURSIVE is defined.
528 #ifdef BOOST_REGEX_NON_RECURSIVE
530 // additional members for non-recursive version:
// Pointer-to-member type shared by the unwind_* handlers: each takes one bool
// and returns bool.
532 typedef bool (self_type::*unwind_proc_type)(bool);
536 bool unwind_end(bool);
537 bool unwind_paren(bool);
538 bool unwind_recursion_stopper(bool);
539 bool unwind_assertion(bool);
540 bool unwind_alt(bool);
541 bool unwind_repeater_counter(bool);
542 bool unwind_extra_block(bool);
543 bool unwind_greedy_single_repeat(bool);
544 bool unwind_slow_dot_repeat(bool);
545 bool unwind_fast_dot_repeat(bool);
546 bool unwind_char_repeat(bool);
547 bool unwind_short_set_repeat(bool);
548 bool unwind_long_set_repeat(bool);
549 bool unwind_non_greedy_repeat(bool);
550 bool unwind_recursion(bool);
551 bool unwind_recursion_pop(bool);
552 bool unwind_commit(bool);
553 bool unwind_then(bool);
554 bool unwind_case(bool);
555 void destroy_single_repeat();
// push_* counterparts. NOTE(review): presumably each records a saved state
// that the matching unwind_* handler later consumes - confirm against
// perl_matcher_non_recursive.hpp.
556 void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
557 void push_recursion_stopper();
558 void push_assertion(const re_syntax_base* ps, bool positive);
559 void push_alt(const re_syntax_base* ps);
560 void push_repeater_count(int i, repeater_count<BidiIterator>** s);
561 void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id);
562 void push_non_greedy_repeat(const re_syntax_base* ps);
563 void push_recursion(int idx, const re_syntax_base* p, results_type* presults, results_type* presults2);
564 void push_recursion_pop();
565 void push_case_change(bool);
567 // pointer to base of stack:
568 saved_state* m_stack_base;
569 // pointer to current stack position:
570 saved_state* m_backup_state;
571 // how many memory blocks have we used up?:
572 unsigned used_block_count;
573 // determines what value to return when unwinding from recursion,
574 // allows for mixed recursive/non-recursive algorithm:
575 bool m_recursive_result;
576 // We have unwound to a lookahead/lookbehind, used by COMMIT/PRUNE/SKIP:
577 bool m_unwound_lookahead;
578 // We have unwound to an alternative, used by THEN:
580 // We are unwinding a commit - used by independent subs to determine whether to stop there or carry on unwinding:
581 //bool m_unwind_commit;
// Unsigned counter; its describing comment is not visible in this listing.
583 unsigned m_recursions;
587 # pragma warning(push)
588 #if BOOST_MSVC >= 1800
589 #pragma warning(disable:26495)
592 // these operations aren't allowed, so are declared private,
593 // bodies are provided to keep explicit-instantiation requests happy:
// Copy assignment (its body is not visible in this listing).
594 perl_matcher& operator=(const perl_matcher&)
// Copy constructor: rebinds only the reference members from `that` and gives
// rep_obj a null stack pointer; no other member appears in the initialiser list.
598 perl_matcher(const perl_matcher& that)
599 : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {}
601 # pragma warning(pop)
605 } // namespace BOOST_REGEX_DETAIL_NS
608 # pragma warning(pop)
612 #pragma warning(push)
613 #pragma warning(disable: 4103)
615 #ifdef BOOST_HAS_ABI_HEADERS
616 # include BOOST_ABI_SUFFIX
625 // include the implementation of perl_matcher:
627 #ifdef BOOST_REGEX_RECURSIVE
628 #include <boost/regex/v4/perl_matcher_recursive.hpp>
630 #include <boost/regex/v4/perl_matcher_non_recursive.hpp>
632 // this one has to be last:
633 #include <boost/regex/v4/perl_matcher_common.hpp>