6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
13 * LOCATION: see http://www.boost.org for most recent version.
14 * FILE basic_regex_creator.cpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Declares template class basic_regex_creator which fills in
17 * the data members of a regex_data object.
20 #ifndef BOOST_REGEX_V5_BASIC_REGEX_CREATOR_HPP
21 #define BOOST_REGEX_V5_BASIC_REGEX_CREATOR_HPP
23 #ifdef BOOST_REGEX_MSVC
24 # pragma warning(push)
25 #pragma warning(disable:4459)
26 #if BOOST_REGEX_MSVC < 1910
27 #pragma warning(disable:4800)
35 namespace BOOST_REGEX_DETAIL_NS{
37 template <class charT>
38 struct digraph : public std::pair<charT, charT>
40 digraph() : std::pair<charT, charT>(charT(0), charT(0)){}
41 digraph(charT c1) : std::pair<charT, charT>(c1, charT(0)){}
42 digraph(charT c1, charT c2) : std::pair<charT, charT>(c1, c2)
44 digraph(const digraph<charT>& d) : std::pair<charT, charT>(d.first, d.second){}
45 digraph<charT>& operator=(const digraph<charT>&) = default;
47 digraph(const Seq& s) : std::pair<charT, charT>()
49 BOOST_REGEX_ASSERT(s.size() <= 2);
50 BOOST_REGEX_ASSERT(s.size());
52 this->second = (s.size() > 1) ? s[1] : 0;
56 template <class charT, class traits>
60 typedef digraph<charT> digraph_type;
61 typedef typename traits::string_type string_type;
62 typedef typename traits::char_class_type m_type;
67 m_has_digraphs = false;
69 m_negated_classes = 0;
73 void add_single(const digraph_type& s)
77 m_has_digraphs = true;
80 void add_range(const digraph_type& first, const digraph_type& end)
82 m_ranges.push_back(first);
83 m_ranges.push_back(end);
86 m_has_digraphs = true;
91 m_has_digraphs = true;
96 void add_class(m_type m)
101 void add_negated_class(m_type m)
103 m_negated_classes |= m;
106 void add_equivalent(const digraph_type& s)
108 m_equivalents.insert(s);
111 m_has_digraphs = true;
123 // accessor functions:
125 bool has_digraphs()const
127 return m_has_digraphs;
129 bool is_negated()const
133 typedef typename std::vector<digraph_type>::const_iterator list_iterator;
134 typedef typename std::set<digraph_type>::const_iterator set_iterator;
135 set_iterator singles_begin()const
137 return m_singles.begin();
139 set_iterator singles_end()const
141 return m_singles.end();
143 list_iterator ranges_begin()const
145 return m_ranges.begin();
147 list_iterator ranges_end()const
149 return m_ranges.end();
151 set_iterator equivalents_begin()const
153 return m_equivalents.begin();
155 set_iterator equivalents_end()const
157 return m_equivalents.end();
159 m_type classes()const
163 m_type negated_classes()const
165 return m_negated_classes;
172 std::set<digraph_type> m_singles; // a list of single characters to match
173 std::vector<digraph_type> m_ranges; // a list of end points of our ranges
174 bool m_negate; // true if the set is to be negated
175 bool m_has_digraphs; // true if we have digraphs present
176 m_type m_classes; // character classes to match
177 m_type m_negated_classes; // negated character classes to match
178 bool m_empty; // whether we've added anything yet
179 std::set<digraph_type> m_equivalents; // a list of equivalence classes
182 template <class charT, class traits>
183 class basic_regex_creator
186 basic_regex_creator(regex_data<charT, traits>* data);
187 std::ptrdiff_t getoffset(void* addr)
189 return getoffset(addr, m_pdata->m_data.data());
191 std::ptrdiff_t getoffset(const void* addr, const void* base)
193 return static_cast<const char*>(addr) - static_cast<const char*>(base);
195 re_syntax_base* getaddress(std::ptrdiff_t off)
197 return getaddress(off, m_pdata->m_data.data());
199 re_syntax_base* getaddress(std::ptrdiff_t off, void* base)
201 return static_cast<re_syntax_base*>(static_cast<void*>(static_cast<char*>(base) + off));
203 void init(unsigned l_flags)
205 m_pdata->m_flags = l_flags;
206 m_icase = l_flags & regex_constants::icase;
208 regbase::flag_type flags()
210 return m_pdata->m_flags;
212 void flags(regbase::flag_type f)
214 m_pdata->m_flags = f;
215 if(m_icase != static_cast<bool>(f & regbase::icase))
217 m_icase = static_cast<bool>(f & regbase::icase);
220 re_syntax_base* append_state(syntax_element_type t, std::size_t s = sizeof(re_syntax_base));
221 re_syntax_base* insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s = sizeof(re_syntax_base));
222 re_literal* append_literal(charT c);
223 re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set);
224 re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, std::integral_constant<bool, false>*);
225 re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, std::integral_constant<bool, true>*);
226 void finalize(const charT* p1, const charT* p2);
228 regex_data<charT, traits>* m_pdata; // pointer to the basic_regex_data struct we are filling in
229 const ::boost::regex_traits_wrapper<traits>&
230 m_traits; // convenience reference to traits class
231 re_syntax_base* m_last_state; // the last state we added
232 bool m_icase; // true for case insensitive matches
233 unsigned m_repeater_id; // the state_id of the next repeater
234 bool m_has_backrefs; // true if there are actually any backrefs
235 std::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for;
236 bool m_has_recursions; // set when we have recursive expressions to fixup
237 std::vector<unsigned char> m_recursion_checks; // notes which recursions we've followed while analysing this expression
238 typename traits::char_class_type m_word_mask; // mask used to determine if a character is a word character
239 typename traits::char_class_type m_mask_space; // mask used to determine if a character is a word character
240 typename traits::char_class_type m_lower_mask; // mask used to determine if a character is a lowercase character
241 typename traits::char_class_type m_upper_mask; // mask used to determine if a character is an uppercase character
242 typename traits::char_class_type m_alpha_mask; // mask used to determine if a character is an alphabetic character
244 basic_regex_creator& operator=(const basic_regex_creator&);
245 basic_regex_creator(const basic_regex_creator&);
247 void fixup_pointers(re_syntax_base* state);
248 void fixup_recursions(re_syntax_base* state);
249 void create_startmaps(re_syntax_base* state);
250 int calculate_backstep(re_syntax_base* state);
251 void create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask);
252 unsigned get_restart_type(re_syntax_base* state);
253 void set_all_masks(unsigned char* bits, unsigned char);
254 bool is_bad_repeat(re_syntax_base* pt);
255 void set_bad_repeat(re_syntax_base* pt);
256 syntax_element_type get_repeat_type(re_syntax_base* state);
257 void probe_leading_repeat(re_syntax_base* state);
260 template <class charT, class traits>
261 basic_regex_creator<charT, traits>::basic_regex_creator(regex_data<charT, traits>* data)
262 : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_icase(false), m_repeater_id(0),
263 m_has_backrefs(false), m_bad_repeats(0), m_has_recursions(false), m_word_mask(0), m_mask_space(0), m_lower_mask(0), m_upper_mask(0), m_alpha_mask(0)
265 m_pdata->m_data.clear();
266 m_pdata->m_status = ::boost::regex_constants::error_ok;
267 static const charT w = 'w';
268 static const charT s = 's';
269 static const charT l[5] = { 'l', 'o', 'w', 'e', 'r', };
270 static const charT u[5] = { 'u', 'p', 'p', 'e', 'r', };
271 static const charT a[5] = { 'a', 'l', 'p', 'h', 'a', };
272 m_word_mask = m_traits.lookup_classname(&w, &w +1);
273 m_mask_space = m_traits.lookup_classname(&s, &s +1);
274 m_lower_mask = m_traits.lookup_classname(l, l + 5);
275 m_upper_mask = m_traits.lookup_classname(u, u + 5);
276 m_alpha_mask = m_traits.lookup_classname(a, a + 5);
277 m_pdata->m_word_mask = m_word_mask;
278 BOOST_REGEX_ASSERT(m_word_mask != 0);
279 BOOST_REGEX_ASSERT(m_mask_space != 0);
280 BOOST_REGEX_ASSERT(m_lower_mask != 0);
281 BOOST_REGEX_ASSERT(m_upper_mask != 0);
282 BOOST_REGEX_ASSERT(m_alpha_mask != 0);
285 template <class charT, class traits>
286 re_syntax_base* basic_regex_creator<charT, traits>::append_state(syntax_element_type t, std::size_t s)
288 // if the state is a backref then make a note of it:
289 if(t == syntax_element_backref)
290 this->m_has_backrefs = true;
291 // append a new state, start by aligning our last one:
292 m_pdata->m_data.align();
293 // set the offset to the next state in our last one:
295 m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state);
296 // now actually extend our data:
297 m_last_state = static_cast<re_syntax_base*>(m_pdata->m_data.extend(s));
298 // fill in boilerplate options in the new state:
299 m_last_state->next.i = 0;
300 m_last_state->type = t;
304 template <class charT, class traits>
305 re_syntax_base* basic_regex_creator<charT, traits>::insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s)
307 // append a new state, start by aligning our last one:
308 m_pdata->m_data.align();
309 // set the offset to the next state in our last one:
311 m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state);
312 // remember the last state position:
313 std::ptrdiff_t off = getoffset(m_last_state) + s;
314 // now actually insert our data:
315 re_syntax_base* new_state = static_cast<re_syntax_base*>(m_pdata->m_data.insert(pos, s));
316 // fill in boilerplate options in the new state:
317 new_state->next.i = s;
319 m_last_state = getaddress(off);
323 template <class charT, class traits>
324 re_literal* basic_regex_creator<charT, traits>::append_literal(charT c)
327 // start by seeing if we have an existing re_literal we can extend:
328 if((0 == m_last_state) || (m_last_state->type != syntax_element_literal))
330 // no existing re_literal, create a new one:
331 result = static_cast<re_literal*>(append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
333 *static_cast<charT*>(static_cast<void*>(result+1)) = m_traits.translate(c, m_icase);
337 // we have an existing re_literal, extend it:
338 std::ptrdiff_t off = getoffset(m_last_state);
339 m_pdata->m_data.extend(sizeof(charT));
340 m_last_state = result = static_cast<re_literal*>(getaddress(off));
341 charT* characters = static_cast<charT*>(static_cast<void*>(result+1));
342 characters[result->length] = m_traits.translate(c, m_icase);
348 template <class charT, class traits>
349 inline re_syntax_base* basic_regex_creator<charT, traits>::append_set(
350 const basic_char_set<charT, traits>& char_set)
352 typedef std::integral_constant<bool, (sizeof(charT) == 1) > truth_type;
353 return char_set.has_digraphs()
354 ? append_set(char_set, static_cast<std::integral_constant<bool, false>*>(0))
355 : append_set(char_set, static_cast<truth_type*>(0));
358 template <class charT, class traits>
359 re_syntax_base* basic_regex_creator<charT, traits>::append_set(
360 const basic_char_set<charT, traits>& char_set, std::integral_constant<bool, false>*)
362 typedef typename traits::string_type string_type;
363 typedef typename basic_char_set<charT, traits>::list_iterator item_iterator;
364 typedef typename basic_char_set<charT, traits>::set_iterator set_iterator;
365 typedef typename traits::char_class_type m_type;
367 re_set_long<m_type>* result = static_cast<re_set_long<m_type>*>(append_state(syntax_element_long_set, sizeof(re_set_long<m_type>)));
369 // fill in the basics:
371 result->csingles = static_cast<unsigned int>(std::distance(char_set.singles_begin(), char_set.singles_end()));
372 result->cranges = static_cast<unsigned int>(std::distance(char_set.ranges_begin(), char_set.ranges_end())) / 2;
373 result->cequivalents = static_cast<unsigned int>(std::distance(char_set.equivalents_begin(), char_set.equivalents_end()));
374 result->cclasses = char_set.classes();
375 result->cnclasses = char_set.negated_classes();
376 if(flags() & regbase::icase)
378 // adjust classes as needed:
379 if(((result->cclasses & m_lower_mask) == m_lower_mask) || ((result->cclasses & m_upper_mask) == m_upper_mask))
380 result->cclasses |= m_alpha_mask;
381 if(((result->cnclasses & m_lower_mask) == m_lower_mask) || ((result->cnclasses & m_upper_mask) == m_upper_mask))
382 result->cnclasses |= m_alpha_mask;
385 result->isnot = char_set.is_negated();
386 result->singleton = !char_set.has_digraphs();
388 // remember where the state is for later:
390 std::ptrdiff_t offset = getoffset(result);
392 // now extend with all the singles:
394 item_iterator first, last;
395 set_iterator sfirst, slast;
396 sfirst = char_set.singles_begin();
397 slast = char_set.singles_end();
398 while(sfirst != slast)
400 charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (sfirst->first == static_cast<charT>(0) ? 1 : sfirst->second ? 3 : 2)));
401 p[0] = m_traits.translate(sfirst->first, m_icase);
402 if(sfirst->first == static_cast<charT>(0))
406 else if(sfirst->second)
408 p[1] = m_traits.translate(sfirst->second, m_icase);
416 // now extend with all the ranges:
418 first = char_set.ranges_begin();
419 last = char_set.ranges_end();
422 // first grab the endpoints of the range:
423 digraph<charT> c1 = *first;
424 c1.first = this->m_traits.translate(c1.first, this->m_icase);
425 c1.second = this->m_traits.translate(c1.second, this->m_icase);
427 digraph<charT> c2 = *first;
428 c2.first = this->m_traits.translate(c2.first, this->m_icase);
429 c2.second = this->m_traits.translate(c2.second, this->m_icase);
432 // different actions now depending upon whether collation is turned on:
433 if(flags() & regex_constants::collate)
435 // we need to transform our range into sort keys:
436 charT a1[3] = { c1.first, c1.second, charT(0), };
437 charT a2[3] = { c2.first, c2.second, charT(0), };
438 s1 = this->m_traits.transform(a1, (a1[1] ? a1+2 : a1+1));
439 s2 = this->m_traits.transform(a2, (a2[1] ? a2+2 : a2+1));
441 s1 = string_type(1, charT(0));
443 s2 = string_type(1, charT(0));
449 s1.insert(s1.end(), c1.first);
450 s1.insert(s1.end(), c1.second);
453 s1 = string_type(1, c1.first);
456 s2.insert(s2.end(), c2.first);
457 s2.insert(s2.end(), c2.second);
460 s2.insert(s2.end(), c2.first);
467 charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s1.size() + s2.size() + 2) ) );
468 BOOST_REGEX_DETAIL_NS::copy(s1.begin(), s1.end(), p);
469 p[s1.size()] = charT(0);
471 BOOST_REGEX_DETAIL_NS::copy(s2.begin(), s2.end(), p);
472 p[s2.size()] = charT(0);
475 // now process the equivalence classes:
477 sfirst = char_set.equivalents_begin();
478 slast = char_set.equivalents_end();
479 while(sfirst != slast)
484 charT cs[3] = { sfirst->first, sfirst->second, charT(0), };
485 s = m_traits.transform_primary(cs, cs+2);
488 s = m_traits.transform_primary(&sfirst->first, &sfirst->first+1);
490 return 0; // invalid or unsupported equivalence class
491 charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s.size()+1) ) );
492 BOOST_REGEX_DETAIL_NS::copy(s.begin(), s.end(), p);
493 p[s.size()] = charT(0);
497 // finally reset the address of our last state:
499 m_last_state = result = static_cast<re_set_long<m_type>*>(getaddress(offset));
504 inline bool char_less(T t1, T t2)
508 inline bool char_less(char t1, char t2)
510 return static_cast<unsigned char>(t1) < static_cast<unsigned char>(t2);
512 inline bool char_less(signed char t1, signed char t2)
514 return static_cast<unsigned char>(t1) < static_cast<unsigned char>(t2);
517 template <class charT, class traits>
518 re_syntax_base* basic_regex_creator<charT, traits>::append_set(
519 const basic_char_set<charT, traits>& char_set, std::integral_constant<bool, true>*)
521 typedef typename traits::string_type string_type;
522 typedef typename basic_char_set<charT, traits>::list_iterator item_iterator;
523 typedef typename basic_char_set<charT, traits>::set_iterator set_iterator;
525 re_set* result = static_cast<re_set*>(append_state(syntax_element_set, sizeof(re_set)));
526 bool negate = char_set.is_negated();
527 std::memset(result->_map, 0, sizeof(result->_map));
529 // handle singles first:
531 item_iterator first, last;
532 set_iterator sfirst, slast;
533 sfirst = char_set.singles_begin();
534 slast = char_set.singles_end();
535 while(sfirst != slast)
537 for(unsigned int i = 0; i < (1 << CHAR_BIT); ++i)
539 if(this->m_traits.translate(static_cast<charT>(i), this->m_icase)
540 == this->m_traits.translate(sfirst->first, this->m_icase))
541 result->_map[i] = true;
546 // OK now handle ranges:
548 first = char_set.ranges_begin();
549 last = char_set.ranges_end();
552 // first grab the endpoints of the range:
553 charT c1 = this->m_traits.translate(first->first, this->m_icase);
555 charT c2 = this->m_traits.translate(first->first, this->m_icase);
557 // different actions now depending upon whether collation is turned on:
558 if(flags() & regex_constants::collate)
560 // we need to transform our range into sort keys:
561 charT c3[2] = { c1, charT(0), };
562 string_type s1 = this->m_traits.transform(c3, c3+1);
564 string_type s2 = this->m_traits.transform(c3, c3+1);
570 BOOST_REGEX_ASSERT(c3[1] == charT(0));
571 for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
573 c3[0] = static_cast<charT>(i);
574 string_type s3 = this->m_traits.transform(c3, c3 +1);
575 if((s1 <= s3) && (s3 <= s2))
576 result->_map[i] = true;
581 if(char_less(c2, c1))
586 // everything in range matches:
587 std::memset(result->_map + static_cast<unsigned char>(c1), true, static_cast<unsigned char>(1u) + static_cast<unsigned char>(static_cast<unsigned char>(c2) - static_cast<unsigned char>(c1)));
591 // and now the classes:
593 typedef typename traits::char_class_type m_type;
594 m_type m = char_set.classes();
595 if(flags() & regbase::icase)
597 // adjust m as needed:
598 if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask))
603 for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
605 if(this->m_traits.isctype(static_cast<charT>(i), m))
606 result->_map[i] = true;
610 // and now the negated classes:
612 m = char_set.negated_classes();
613 if(flags() & regbase::icase)
615 // adjust m as needed:
616 if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask))
621 for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
623 if(0 == this->m_traits.isctype(static_cast<charT>(i), m))
624 result->_map[i] = true;
628 // now process the equivalence classes:
630 sfirst = char_set.equivalents_begin();
631 slast = char_set.equivalents_end();
632 while(sfirst != slast)
635 BOOST_REGEX_ASSERT(static_cast<charT>(0) == sfirst->second);
636 s = m_traits.transform_primary(&sfirst->first, &sfirst->first+1);
638 return 0; // invalid or unsupported equivalence class
639 for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
641 charT c[2] = { (static_cast<charT>(i)), charT(0), };
642 string_type s2 = this->m_traits.transform_primary(c, c+1);
644 result->_map[i] = true;
650 for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
652 result->_map[i] = !(result->_map[i]);
658 template <class charT, class traits>
659 void basic_regex_creator<charT, traits>::finalize(const charT* p1, const charT* p2)
661 if(this->m_pdata->m_status)
663 // we've added all the states we need, now finish things off.
664 // start by adding a terminating state:
665 append_state(syntax_element_match);
666 // extend storage to store original expression:
667 std::ptrdiff_t len = p2 - p1;
668 m_pdata->m_expression_len = len;
669 charT* ps = static_cast<charT*>(m_pdata->m_data.extend(sizeof(charT) * (1 + (p2 - p1))));
670 m_pdata->m_expression = ps;
671 BOOST_REGEX_DETAIL_NS::copy(p1, p2, ps);
673 // fill in our other data...
674 // successful parsing implies a zero status:
675 m_pdata->m_status = 0;
676 // get the first state of the machine:
677 m_pdata->m_first_state = static_cast<re_syntax_base*>(m_pdata->m_data.data());
678 // fixup pointers in the machine:
679 fixup_pointers(m_pdata->m_first_state);
682 m_pdata->m_has_recursions = true;
683 fixup_recursions(m_pdata->m_first_state);
684 if(this->m_pdata->m_status)
688 m_pdata->m_has_recursions = false;
689 // create nested startmaps:
690 create_startmaps(m_pdata->m_first_state);
691 // create main startmap:
692 std::memset(m_pdata->m_startmap, 0, sizeof(m_pdata->m_startmap));
693 m_pdata->m_can_be_null = 0;
697 m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u);
698 create_startmap(m_pdata->m_first_state, m_pdata->m_startmap, &(m_pdata->m_can_be_null), mask_all);
699 // get the restart type:
700 m_pdata->m_restart_type = get_restart_type(m_pdata->m_first_state);
701 // optimise a leading repeat if there is one:
702 probe_leading_repeat(m_pdata->m_first_state);
705 template <class charT, class traits>
706 void basic_regex_creator<charT, traits>::fixup_pointers(re_syntax_base* state)
712 case syntax_element_recurse:
713 m_has_recursions = true;
715 state->next.p = getaddress(state->next.i, state);
719 case syntax_element_rep:
720 case syntax_element_dot_rep:
721 case syntax_element_char_rep:
722 case syntax_element_short_set_rep:
723 case syntax_element_long_set_rep:
724 // set the state_id of this repeat:
725 static_cast<re_repeat*>(state)->state_id = m_repeater_id++;
726 BOOST_REGEX_FALLTHROUGH;
727 case syntax_element_alt:
728 std::memset(static_cast<re_alt*>(state)->_map, 0, sizeof(static_cast<re_alt*>(state)->_map));
729 static_cast<re_alt*>(state)->can_be_null = 0;
730 BOOST_REGEX_FALLTHROUGH;
731 case syntax_element_jump:
732 static_cast<re_jump*>(state)->alt.p = getaddress(static_cast<re_jump*>(state)->alt.i, state);
733 BOOST_REGEX_FALLTHROUGH;
736 state->next.p = getaddress(state->next.i, state);
740 state = state->next.p;
744 template <class charT, class traits>
745 void basic_regex_creator<charT, traits>::fixup_recursions(re_syntax_base* state)
747 re_syntax_base* base = state;
752 case syntax_element_assert_backref:
754 // just check that the index is valid:
755 int idx = static_cast<const re_brace*>(state)->index;
759 if(idx >= hash_value_mask)
761 idx = m_pdata->get_id(idx);
764 // check of sub-expression that doesn't exist:
765 if(0 == this->m_pdata->m_status) // update the error code if not already set
766 this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
768 // clear the expression, we should be empty:
770 this->m_pdata->m_expression = 0;
771 this->m_pdata->m_expression_len = 0;
773 // and throw if required:
775 if(0 == (this->flags() & regex_constants::no_except))
777 std::string message = "Encountered a forward reference to a marked sub-expression that does not exist.";
778 boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
786 case syntax_element_recurse:
789 re_syntax_base* p = base;
790 std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i;
791 if(idx >= hash_value_mask)
794 // There may be more than one capture group with this hash, just do what Perl
795 // does and recurse to the leftmost:
797 idx = m_pdata->get_id(static_cast<int>(idx));
807 if((p->type == syntax_element_startmark) && (static_cast<re_brace*>(p)->index == idx))
810 // We've found the target of the recursion, set the jump target:
812 static_cast<re_jump*>(state)->alt.p = p;
815 // Now scan the target for nested repeats:
823 case syntax_element_rep:
824 case syntax_element_dot_rep:
825 case syntax_element_char_rep:
826 case syntax_element_short_set_rep:
827 case syntax_element_long_set_rep:
828 next_rep_id = static_cast<re_repeat*>(p)->state_id;
830 case syntax_element_endmark:
831 if(static_cast<const re_brace*>(p)->index == idx)
843 static_cast<re_recurse*>(state)->state_id = next_rep_id - 1;
853 // recursion to sub-expression that doesn't exist:
854 if(0 == this->m_pdata->m_status) // update the error code if not already set
855 this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
857 // clear the expression, we should be empty:
859 this->m_pdata->m_expression = 0;
860 this->m_pdata->m_expression_len = 0;
862 // and throw if required:
864 if(0 == (this->flags() & regex_constants::no_except))
866 std::string message = "Encountered a forward reference to a recursive sub-expression that does not exist.";
867 boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
876 state = state->next.p;
880 template <class charT, class traits>
881 void basic_regex_creator<charT, traits>::create_startmaps(re_syntax_base* state)
883 // non-recursive implementation:
884 // create the last map in the machine first, so that earlier maps
885 // can make use of the result...
887 // This was originally a recursive implementation, but that caused stack
888 // overflows with complex expressions on small stacks (think COM+).
890 // start by saving the case setting:
891 bool l_icase = m_icase;
892 std::vector<std::pair<bool, re_syntax_base*> > v;
898 case syntax_element_toggle_case:
899 // we need to track case changes here:
900 m_icase = static_cast<re_case*>(state)->icase;
901 state = state->next.p;
903 case syntax_element_alt:
904 case syntax_element_rep:
905 case syntax_element_dot_rep:
906 case syntax_element_char_rep:
907 case syntax_element_short_set_rep:
908 case syntax_element_long_set_rep:
909 // just push the state onto our stack for now:
910 v.push_back(std::pair<bool, re_syntax_base*>(m_icase, state));
911 state = state->next.p;
913 case syntax_element_backstep:
914 // we need to calculate how big the backstep is:
915 static_cast<re_brace*>(state)->index
916 = this->calculate_backstep(state->next.p);
917 if(static_cast<re_brace*>(state)->index < 0)
920 if(0 == this->m_pdata->m_status) // update the error code if not already set
921 this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
923 // clear the expression, we should be empty:
925 this->m_pdata->m_expression = 0;
926 this->m_pdata->m_expression_len = 0;
928 // and throw if required:
930 if(0 == (this->flags() & regex_constants::no_except))
932 std::string message = "Invalid lookbehind assertion encountered in the regular expression.";
933 boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
937 BOOST_REGEX_FALLTHROUGH;
939 state = state->next.p;
943 // now work through our list, building all the maps as we go:
946 // Initialize m_recursion_checks if we need it:
948 m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u);
950 const std::pair<bool, re_syntax_base*>& p = v.back();
957 create_startmap(state->next.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_take);
961 m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u);
962 create_startmap(static_cast<re_alt*>(state)->alt.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_skip);
963 // adjust the type of the state to allow for faster matching:
964 state->type = this->get_repeat_type(state);
966 // restore case sensitivity:
970 template <class charT, class traits>
971 int basic_regex_creator<charT, traits>::calculate_backstep(re_syntax_base* state)
973 typedef typename traits::char_class_type m_type;
979 case syntax_element_startmark:
980 if((static_cast<re_brace*>(state)->index == -1)
981 || (static_cast<re_brace*>(state)->index == -2))
983 state = static_cast<re_jump*>(state->next.p)->alt.p->next.p;
986 else if(static_cast<re_brace*>(state)->index == -3)
988 state = state->next.p->next.p;
992 case syntax_element_endmark:
993 if((static_cast<re_brace*>(state)->index == -1)
994 || (static_cast<re_brace*>(state)->index == -2))
997 case syntax_element_literal:
998 result += static_cast<re_literal*>(state)->length;
1000 case syntax_element_wild:
1001 case syntax_element_set:
1004 case syntax_element_dot_rep:
1005 case syntax_element_char_rep:
1006 case syntax_element_short_set_rep:
1007 case syntax_element_backref:
1008 case syntax_element_rep:
1009 case syntax_element_combining:
1010 case syntax_element_long_set_rep:
1011 case syntax_element_backstep:
1013 re_repeat* rep = static_cast<re_repeat *>(state);
1014 // adjust the type of the state to allow for faster matching:
1015 state->type = this->get_repeat_type(state);
1016 if((state->type == syntax_element_dot_rep)
1017 || (state->type == syntax_element_char_rep)
1018 || (state->type == syntax_element_short_set_rep))
1020 if(rep->max != rep->min)
1022 result += static_cast<int>(rep->min);
1026 else if(state->type == syntax_element_long_set_rep)
1028 BOOST_REGEX_ASSERT(rep->next.p->type == syntax_element_long_set);
1029 if(static_cast<re_set_long<m_type>*>(rep->next.p)->singleton == 0)
1031 if(rep->max != rep->min)
1033 result += static_cast<int>(rep->min);
1039 case syntax_element_long_set:
1040 if(static_cast<re_set_long<m_type>*>(state)->singleton == 0)
1044 case syntax_element_jump:
1045 state = static_cast<re_jump*>(state)->alt.p;
1047 case syntax_element_alt:
1049 int r1 = calculate_backstep(state->next.p);
1050 int r2 = calculate_backstep(static_cast<re_alt*>(state)->alt.p);
1051 if((r1 < 0) || (r1 != r2))
1058 state = state->next.p;
1063 struct recursion_saver
1065 std::vector<unsigned char> saved_state;
1066 std::vector<unsigned char>* state;
1067 recursion_saver(std::vector<unsigned char>* p) : saved_state(*p), state(p) {}
1070 state->swap(saved_state);
1074 template <class charT, class traits>
1075 void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask)
1077 recursion_saver saved_recursions(&m_recursion_checks);
1078 int not_last_jump = 1;
1079 re_syntax_base* recursion_start = 0;
1080 int recursion_sub = 0;
1081 re_syntax_base* recursion_restart = 0;
1083 // track case sensitivity:
1084 bool l_icase = m_icase;
1090 case syntax_element_toggle_case:
1091 l_icase = static_cast<re_case*>(state)->icase;
1092 state = state->next.p;
1094 case syntax_element_literal:
1096 // don't set anything in *pnull, set each element in l_map
1097 // that could match the first character in the literal:
1100 l_map[0] |= mask_init;
1101 charT first_char = *static_cast<charT*>(static_cast<void*>(static_cast<re_literal*>(state) + 1));
1102 for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1104 if(m_traits.translate(static_cast<charT>(i), l_icase) == first_char)
1110 case syntax_element_end_line:
1112 // next character must be a line separator (if there is one):
1115 l_map[0] |= mask_init;
1116 l_map[static_cast<unsigned>('\n')] |= mask;
1117 l_map[static_cast<unsigned>('\r')] |= mask;
1118 l_map[static_cast<unsigned>('\f')] |= mask;
1119 l_map[0x85] |= mask;
1121 // now figure out if we can match a NULL string at this point:
1123 create_startmap(state->next.p, 0, pnull, mask);
1126 case syntax_element_recurse:
1128 BOOST_REGEX_ASSERT(static_cast<const re_jump*>(state)->alt.p->type == syntax_element_startmark);
1129 recursion_sub = static_cast<re_brace*>(static_cast<const re_jump*>(state)->alt.p)->index;
1130 if(m_recursion_checks[recursion_sub] & 1u)
1132 // Infinite recursion!!
1133 if(0 == this->m_pdata->m_status) // update the error code if not already set
1134 this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
1136 // clear the expression, we should be empty:
1138 this->m_pdata->m_expression = 0;
1139 this->m_pdata->m_expression_len = 0;
1141 // and throw if required:
1143 if(0 == (this->flags() & regex_constants::no_except))
1145 std::string message = "Encountered an infinite recursion.";
1146 boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
1150 else if(recursion_start == 0)
1152 recursion_start = state;
1153 recursion_restart = state->next.p;
1154 state = static_cast<re_jump*>(state)->alt.p;
1155 m_recursion_checks[recursion_sub] |= 1u;
1158 m_recursion_checks[recursion_sub] |= 1u;
1159 // can't handle nested recursion here...
1160 BOOST_REGEX_FALLTHROUGH;
1162 case syntax_element_backref:
1163 // can be null, and any character can match:
1166 BOOST_REGEX_FALLTHROUGH;
1167 case syntax_element_wild:
1169 // can't be null, any character can match:
1170 set_all_masks(l_map, mask);
1173 case syntax_element_accept:
1174 case syntax_element_match:
1176 // must be null, any character can match:
1177 set_all_masks(l_map, mask);
1182 case syntax_element_word_start:
1184 // recurse, then AND with all the word characters:
1185 create_startmap(state->next.p, l_map, pnull, mask);
1188 l_map[0] |= mask_init;
1189 for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1191 if(!m_traits.isctype(static_cast<charT>(i), m_word_mask))
1192 l_map[i] &= static_cast<unsigned char>(~mask);
1197 case syntax_element_word_end:
1199 // recurse, then AND with all the word characters:
1200 create_startmap(state->next.p, l_map, pnull, mask);
1203 l_map[0] |= mask_init;
1204 for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1206 if(m_traits.isctype(static_cast<charT>(i), m_word_mask))
1207 l_map[i] &= static_cast<unsigned char>(~mask);
1212 case syntax_element_buffer_end:
1214 // we *must be null* :
1219 case syntax_element_long_set:
1222 typedef typename traits::char_class_type m_type;
1223 if(static_cast<re_set_long<m_type>*>(state)->singleton)
1225 l_map[0] |= mask_init;
1226 for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1228 charT c = static_cast<charT>(i);
1229 if(&c != re_is_set_member(&c, &c + 1, static_cast<re_set_long<m_type>*>(state), *m_pdata, l_icase))
1234 set_all_masks(l_map, mask);
1237 case syntax_element_set:
1240 l_map[0] |= mask_init;
1241 for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1243 if(static_cast<re_set*>(state)->_map[
1244 static_cast<unsigned char>(m_traits.translate(static_cast<charT>(i), l_icase))])
1249 case syntax_element_jump:
1251 state = static_cast<re_alt*>(state)->alt.p;
1254 case syntax_element_alt:
1255 case syntax_element_rep:
1256 case syntax_element_dot_rep:
1257 case syntax_element_char_rep:
1258 case syntax_element_short_set_rep:
1259 case syntax_element_long_set_rep:
1261 re_alt* rep = static_cast<re_alt*>(state);
1262 if(rep->_map[0] & mask_init)
1266 // copy previous results:
1267 l_map[0] |= mask_init;
1268 for(unsigned int i = 0; i <= UCHAR_MAX; ++i)
1270 if(rep->_map[i] & mask_any)
1276 if(rep->can_be_null & mask_any)
1282 // we haven't created a startmap for this alternative yet
1283 // so take the union of the two options:
1284 if(is_bad_repeat(state))
1286 set_all_masks(l_map, mask);
1291 set_bad_repeat(state);
1292 create_startmap(state->next.p, l_map, pnull, mask);
1293 if((state->type == syntax_element_alt)
1294 || (static_cast<re_repeat*>(state)->min == 0)
1295 || (not_last_jump == 0))
1296 create_startmap(rep->alt.p, l_map, pnull, mask);
1300 case syntax_element_soft_buffer_end:
1301 // match newline or null:
1304 l_map[0] |= mask_init;
1305 l_map[static_cast<unsigned>('\n')] |= mask;
1306 l_map[static_cast<unsigned>('\r')] |= mask;
1311 case syntax_element_endmark:
1312 // need to handle independent subs as a special case:
1313 if(static_cast<re_brace*>(state)->index < 0)
1315 // can be null, any character can match:
1316 set_all_masks(l_map, mask);
1321 else if(recursion_start && (recursion_sub != 0) && (recursion_sub == static_cast<re_brace*>(state)->index))
1323 // recursion termination:
1324 recursion_start = 0;
1325 state = recursion_restart;
1330 // Normally we just go to the next state... but if this sub-expression is
1331 // the target of a recursion, then we might be ending a recursion, in which
1332 // case we should check whatever follows that recursion, as well as whatever
1333 // follows this state:
1335 if(m_pdata->m_has_recursions && static_cast<re_brace*>(state)->index)
1338 re_syntax_base* p = m_pdata->m_first_state;
1341 if(p->type == syntax_element_recurse)
1343 re_brace* p2 = static_cast<re_brace*>(static_cast<re_jump*>(p)->alt.p);
1344 if((p2->type == syntax_element_startmark) && (p2->index == static_cast<re_brace*>(state)->index))
1352 if(ok && ((m_recursion_checks[static_cast<re_brace*>(state)->index] & 2u) == 0))
1354 m_recursion_checks[static_cast<re_brace*>(state)->index] |= 2u;
1355 create_startmap(p->next.p, l_map, pnull, mask);
1358 state = state->next.p;
1361 case syntax_element_commit:
1362 set_all_masks(l_map, mask);
1363 // Continue scanning so we can figure out whether we can be null:
1364 state = state->next.p;
1366 case syntax_element_startmark:
1367 // need to handle independent subs as a special case:
1368 if(static_cast<re_brace*>(state)->index == -3)
1370 state = state->next.p->next.p;
1373 BOOST_REGEX_FALLTHROUGH;
1375 state = state->next.p;
1381 template <class charT, class traits>
1382 unsigned basic_regex_creator<charT, traits>::get_restart_type(re_syntax_base* state)
1385 // find out how the machine starts, so we can optimise the search:
1391 case syntax_element_startmark:
1392 case syntax_element_endmark:
1393 state = state->next.p;
1395 case syntax_element_start_line:
1396 return regbase::restart_line;
1397 case syntax_element_word_start:
1398 return regbase::restart_word;
1399 case syntax_element_buffer_start:
1400 return regbase::restart_buf;
1401 case syntax_element_restart_continue:
1402 return regbase::restart_continue;
1408 return regbase::restart_any;
1411 template <class charT, class traits>
1412 void basic_regex_creator<charT, traits>::set_all_masks(unsigned char* bits, unsigned char mask)
1415 // set mask in all of bits elements,
1416 // if bits[0] has mask_init not set then we can
1417 // optimise this to a call to memset:
1422 (std::memset)(bits, mask, 1u << CHAR_BIT);
1425 for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
1428 bits[0] |= mask_init;
1432 template <class charT, class traits>
1433 bool basic_regex_creator<charT, traits>::is_bad_repeat(re_syntax_base* pt)
1437 case syntax_element_rep:
1438 case syntax_element_dot_rep:
1439 case syntax_element_char_rep:
1440 case syntax_element_short_set_rep:
1441 case syntax_element_long_set_rep:
1443 unsigned state_id = static_cast<re_repeat*>(pt)->state_id;
1444 if(state_id >= sizeof(m_bad_repeats) * CHAR_BIT)
1445 return true; // run out of bits, assume we can't traverse this one.
1446 static const std::uintmax_t one = 1uL;
1447 return m_bad_repeats & (one << state_id);
1454 template <class charT, class traits>
1455 void basic_regex_creator<charT, traits>::set_bad_repeat(re_syntax_base* pt)
1459 case syntax_element_rep:
1460 case syntax_element_dot_rep:
1461 case syntax_element_char_rep:
1462 case syntax_element_short_set_rep:
1463 case syntax_element_long_set_rep:
1465 unsigned state_id = static_cast<re_repeat*>(pt)->state_id;
1466 static const std::uintmax_t one = 1uL;
1467 if(state_id <= sizeof(m_bad_repeats) * CHAR_BIT)
1468 m_bad_repeats |= (one << state_id);
1476 template <class charT, class traits>
1477 syntax_element_type basic_regex_creator<charT, traits>::get_repeat_type(re_syntax_base* state)
1479 typedef typename traits::char_class_type m_type;
1480 if(state->type == syntax_element_rep)
1482 // check to see if we are repeating a single state:
1483 if(state->next.p->next.p->next.p == static_cast<re_alt*>(state)->alt.p)
1485 switch(state->next.p->type)
1487 case BOOST_REGEX_DETAIL_NS::syntax_element_wild:
1488 return BOOST_REGEX_DETAIL_NS::syntax_element_dot_rep;
1489 case BOOST_REGEX_DETAIL_NS::syntax_element_literal:
1490 return BOOST_REGEX_DETAIL_NS::syntax_element_char_rep;
1491 case BOOST_REGEX_DETAIL_NS::syntax_element_set:
1492 return BOOST_REGEX_DETAIL_NS::syntax_element_short_set_rep;
1493 case BOOST_REGEX_DETAIL_NS::syntax_element_long_set:
1494 if(static_cast<BOOST_REGEX_DETAIL_NS::re_set_long<m_type>*>(state->next.p)->singleton)
1495 return BOOST_REGEX_DETAIL_NS::syntax_element_long_set_rep;
1505 template <class charT, class traits>
1506 void basic_regex_creator<charT, traits>::probe_leading_repeat(re_syntax_base* state)
1508 // enumerate our states, and see if we have a leading repeat
1509 // for which failed search restarts can be optimized;
1514 case syntax_element_startmark:
1515 if(static_cast<re_brace*>(state)->index >= 0)
1517 state = state->next.p;
1520 #ifdef BOOST_REGEX_MSVC
1521 # pragma warning(push)
1522 #pragma warning(disable:6011)
1524 if((static_cast<re_brace*>(state)->index == -1)
1525 || (static_cast<re_brace*>(state)->index == -2))
1527 // skip past the zero width assertion:
1528 state = static_cast<const re_jump*>(state->next.p)->alt.p->next.p;
1531 #ifdef BOOST_REGEX_MSVC
1532 # pragma warning(pop)
1534 if(static_cast<re_brace*>(state)->index == -3)
1536 // Have to skip the leading jump state:
1537 state = state->next.p->next.p;
1541 case syntax_element_endmark:
1542 case syntax_element_start_line:
1543 case syntax_element_end_line:
1544 case syntax_element_word_boundary:
1545 case syntax_element_within_word:
1546 case syntax_element_word_start:
1547 case syntax_element_word_end:
1548 case syntax_element_buffer_start:
1549 case syntax_element_buffer_end:
1550 case syntax_element_restart_continue:
1551 state = state->next.p;
1553 case syntax_element_dot_rep:
1554 case syntax_element_char_rep:
1555 case syntax_element_short_set_rep:
1556 case syntax_element_long_set_rep:
1557 if(this->m_has_backrefs == 0)
1558 static_cast<re_repeat*>(state)->leading = true;
1559 BOOST_REGEX_FALLTHROUGH;
1566 } // namespace BOOST_REGEX_DETAIL_NS
1568 } // namespace boost
1570 #ifdef BOOST_REGEX_MSVC
1571 # pragma warning(pop)