]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * | |
3 | * Copyright (c) 2004 | |
4 | * John Maddock | |
5 | * | |
6 | * Use, modification and distribution are subject to the | |
7 | * Boost Software License, Version 1.0. (See accompanying file | |
8 | * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
9 | * | |
10 | */ | |
11 | ||
12 | /* | |
13 | * LOCATION: see http://www.boost.org for most recent version. | |
14 | * FILE basic_regex_creator.hpp | |
15 | * VERSION see <boost/version.hpp> | |
16 | * DESCRIPTION: Declares template class basic_regex_creator which fills in | |
17 | * the data members of a regex_data object. | |
18 | */ | |
19 | ||
20 | #ifndef BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP | |
21 | #define BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP | |
22 | ||
20effc67 TL |
23 | #include <boost/regex/v4/indexed_bit_flag.hpp> |
24 | ||
7c673cae FG |
25 | #ifdef BOOST_MSVC |
26 | #pragma warning(push) | |
27 | #pragma warning(disable: 4103) | |
28 | #endif | |
29 | #ifdef BOOST_HAS_ABI_HEADERS | |
30 | # include BOOST_ABI_PREFIX | |
31 | #endif | |
32 | #ifdef BOOST_MSVC | |
33 | #pragma warning(pop) | |
34 | #endif | |
35 | ||
36 | #ifdef BOOST_MSVC | |
37 | # pragma warning(push) | |
92f5a8d4 TL |
38 | #if BOOST_MSVC < 1910 |
39 | #pragma warning(disable:4800) | |
40 | #endif | |
7c673cae FG |
41 | #endif |
42 | ||
43 | namespace boost{ | |
44 | ||
45 | namespace BOOST_REGEX_DETAIL_NS{ | |
46 | ||
//
// digraph: a pair of characters.  A null second character means the
// object holds just one character, otherwise it holds a two character
// sequence.
//
template <class charT>
struct digraph : public std::pair<charT, charT>
{
   // empty digraph, both characters are null:
   digraph() : std::pair<charT, charT>(charT(0), charT(0))
   {}
   // converting constructor from a single character, second is null:
   digraph(charT c1) : std::pair<charT, charT>(c1, charT(0))
   {}
   // both characters given explicitly:
   digraph(charT c1, charT c2) : std::pair<charT, charT>(c1, c2)
   {}
   // member-wise copy:
   digraph(const digraph<charT>& d) : std::pair<charT, charT>(d.first, d.second)
   {}
#ifndef BOOST_NO_CXX11_DEFAULTED_FUNCTIONS
   digraph<charT>& operator=(const digraph<charT>&) = default;
#endif
   //
   // construct from a sequence (anything with size() and operator[])
   // holding exactly one or two characters:
   //
   template <class Seq>
   digraph(const Seq& s) : std::pair<charT, charT>()
   {
      BOOST_REGEX_ASSERT(s.size() <= 2);
      BOOST_REGEX_ASSERT(s.size());
      this->first = s[0];
      if(s.size() > 1)
         this->second = s[1];
      else
         this->second = 0;
   }
};
67 | ||
68 | template <class charT, class traits> | |
69 | class basic_char_set | |
70 | { | |
71 | public: | |
72 | typedef digraph<charT> digraph_type; | |
73 | typedef typename traits::string_type string_type; | |
74 | typedef typename traits::char_class_type m_type; | |
75 | ||
76 | basic_char_set() | |
77 | { | |
78 | m_negate = false; | |
79 | m_has_digraphs = false; | |
80 | m_classes = 0; | |
81 | m_negated_classes = 0; | |
82 | m_empty = true; | |
83 | } | |
84 | ||
85 | void add_single(const digraph_type& s) | |
86 | { | |
b32b8144 | 87 | m_singles.insert(s); |
7c673cae FG |
88 | if(s.second) |
89 | m_has_digraphs = true; | |
90 | m_empty = false; | |
91 | } | |
92 | void add_range(const digraph_type& first, const digraph_type& end) | |
93 | { | |
b32b8144 FG |
94 | m_ranges.push_back(first); |
95 | m_ranges.push_back(end); | |
7c673cae FG |
96 | if(first.second) |
97 | { | |
98 | m_has_digraphs = true; | |
99 | add_single(first); | |
100 | } | |
101 | if(end.second) | |
102 | { | |
103 | m_has_digraphs = true; | |
104 | add_single(end); | |
105 | } | |
106 | m_empty = false; | |
107 | } | |
108 | void add_class(m_type m) | |
109 | { | |
110 | m_classes |= m; | |
111 | m_empty = false; | |
112 | } | |
113 | void add_negated_class(m_type m) | |
114 | { | |
115 | m_negated_classes |= m; | |
116 | m_empty = false; | |
117 | } | |
118 | void add_equivalent(const digraph_type& s) | |
119 | { | |
b32b8144 | 120 | m_equivalents.insert(s); |
7c673cae FG |
121 | if(s.second) |
122 | { | |
123 | m_has_digraphs = true; | |
124 | add_single(s); | |
125 | } | |
126 | m_empty = false; | |
127 | } | |
128 | void negate() | |
129 | { | |
130 | m_negate = true; | |
131 | //m_empty = false; | |
132 | } | |
133 | ||
134 | // | |
135 | // accessor functions: | |
136 | // | |
137 | bool has_digraphs()const | |
138 | { | |
139 | return m_has_digraphs; | |
140 | } | |
141 | bool is_negated()const | |
142 | { | |
143 | return m_negate; | |
144 | } | |
145 | typedef typename std::vector<digraph_type>::const_iterator list_iterator; | |
b32b8144 FG |
146 | typedef typename std::set<digraph_type>::const_iterator set_iterator; |
147 | set_iterator singles_begin()const | |
7c673cae FG |
148 | { |
149 | return m_singles.begin(); | |
150 | } | |
b32b8144 | 151 | set_iterator singles_end()const |
7c673cae FG |
152 | { |
153 | return m_singles.end(); | |
154 | } | |
155 | list_iterator ranges_begin()const | |
156 | { | |
157 | return m_ranges.begin(); | |
158 | } | |
159 | list_iterator ranges_end()const | |
160 | { | |
161 | return m_ranges.end(); | |
162 | } | |
b32b8144 | 163 | set_iterator equivalents_begin()const |
7c673cae FG |
164 | { |
165 | return m_equivalents.begin(); | |
166 | } | |
b32b8144 | 167 | set_iterator equivalents_end()const |
7c673cae FG |
168 | { |
169 | return m_equivalents.end(); | |
170 | } | |
171 | m_type classes()const | |
172 | { | |
173 | return m_classes; | |
174 | } | |
175 | m_type negated_classes()const | |
176 | { | |
177 | return m_negated_classes; | |
178 | } | |
179 | bool empty()const | |
180 | { | |
181 | return m_empty; | |
182 | } | |
183 | private: | |
b32b8144 | 184 | std::set<digraph_type> m_singles; // a list of single characters to match |
7c673cae FG |
185 | std::vector<digraph_type> m_ranges; // a list of end points of our ranges |
186 | bool m_negate; // true if the set is to be negated | |
187 | bool m_has_digraphs; // true if we have digraphs present | |
188 | m_type m_classes; // character classes to match | |
189 | m_type m_negated_classes; // negated character classes to match | |
190 | bool m_empty; // whether we've added anything yet | |
b32b8144 | 191 | std::set<digraph_type> m_equivalents; // a list of equivalence classes |
7c673cae FG |
192 | }; |
193 | ||
194 | template <class charT, class traits> | |
195 | class basic_regex_creator | |
196 | { | |
197 | public: | |
198 | basic_regex_creator(regex_data<charT, traits>* data); | |
199 | std::ptrdiff_t getoffset(void* addr) | |
200 | { | |
201 | return getoffset(addr, m_pdata->m_data.data()); | |
202 | } | |
203 | std::ptrdiff_t getoffset(const void* addr, const void* base) | |
204 | { | |
205 | return static_cast<const char*>(addr) - static_cast<const char*>(base); | |
206 | } | |
207 | re_syntax_base* getaddress(std::ptrdiff_t off) | |
208 | { | |
209 | return getaddress(off, m_pdata->m_data.data()); | |
210 | } | |
211 | re_syntax_base* getaddress(std::ptrdiff_t off, void* base) | |
212 | { | |
213 | return static_cast<re_syntax_base*>(static_cast<void*>(static_cast<char*>(base) + off)); | |
214 | } | |
215 | void init(unsigned l_flags) | |
216 | { | |
217 | m_pdata->m_flags = l_flags; | |
218 | m_icase = l_flags & regex_constants::icase; | |
219 | } | |
220 | regbase::flag_type flags() | |
221 | { | |
222 | return m_pdata->m_flags; | |
223 | } | |
224 | void flags(regbase::flag_type f) | |
225 | { | |
226 | m_pdata->m_flags = f; | |
227 | if(m_icase != static_cast<bool>(f & regbase::icase)) | |
228 | { | |
229 | m_icase = static_cast<bool>(f & regbase::icase); | |
230 | } | |
231 | } | |
232 | re_syntax_base* append_state(syntax_element_type t, std::size_t s = sizeof(re_syntax_base)); | |
233 | re_syntax_base* insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s = sizeof(re_syntax_base)); | |
234 | re_literal* append_literal(charT c); | |
235 | re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set); | |
236 | re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, mpl::false_*); | |
237 | re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, mpl::true_*); | |
238 | void finalize(const charT* p1, const charT* p2); | |
239 | protected: | |
240 | regex_data<charT, traits>* m_pdata; // pointer to the basic_regex_data struct we are filling in | |
241 | const ::boost::regex_traits_wrapper<traits>& | |
242 | m_traits; // convenience reference to traits class | |
243 | re_syntax_base* m_last_state; // the last state we added | |
244 | bool m_icase; // true for case insensitive matches | |
245 | unsigned m_repeater_id; // the state_id of the next repeater | |
246 | bool m_has_backrefs; // true if there are actually any backrefs | |
20effc67 | 247 | indexed_bit_flag m_backrefs; // bitmask of permitted backrefs |
7c673cae | 248 | boost::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for; |
1e59de90 | 249 | bool m_has_recursions; // set when we have recursive expressions to fixup |
b32b8144 | 250 | std::vector<unsigned char> m_recursion_checks; // notes which recursions we've followed while analysing this expression |
7c673cae FG |
251 | typename traits::char_class_type m_word_mask; // mask used to determine if a character is a word character |
252 | typename traits::char_class_type m_mask_space; // mask used to determine if a character is a word character | |
253 | typename traits::char_class_type m_lower_mask; // mask used to determine if a character is a lowercase character | |
254 | typename traits::char_class_type m_upper_mask; // mask used to determine if a character is an uppercase character | |
255 | typename traits::char_class_type m_alpha_mask; // mask used to determine if a character is an alphabetic character | |
256 | private: | |
257 | basic_regex_creator& operator=(const basic_regex_creator&); | |
258 | basic_regex_creator(const basic_regex_creator&); | |
259 | ||
260 | void fixup_pointers(re_syntax_base* state); | |
261 | void fixup_recursions(re_syntax_base* state); | |
262 | void create_startmaps(re_syntax_base* state); | |
263 | int calculate_backstep(re_syntax_base* state); | |
264 | void create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask); | |
265 | unsigned get_restart_type(re_syntax_base* state); | |
266 | void set_all_masks(unsigned char* bits, unsigned char); | |
267 | bool is_bad_repeat(re_syntax_base* pt); | |
268 | void set_bad_repeat(re_syntax_base* pt); | |
269 | syntax_element_type get_repeat_type(re_syntax_base* state); | |
270 | void probe_leading_repeat(re_syntax_base* state); | |
271 | }; | |
272 | ||
273 | template <class charT, class traits> | |
274 | basic_regex_creator<charT, traits>::basic_regex_creator(regex_data<charT, traits>* data) | |
20effc67 TL |
275 | : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_icase(false), m_repeater_id(0), |
276 | m_has_backrefs(false), m_bad_repeats(0), m_has_recursions(false), m_word_mask(0), m_mask_space(0), m_lower_mask(0), m_upper_mask(0), m_alpha_mask(0) | |
7c673cae FG |
277 | { |
278 | m_pdata->m_data.clear(); | |
279 | m_pdata->m_status = ::boost::regex_constants::error_ok; | |
280 | static const charT w = 'w'; | |
281 | static const charT s = 's'; | |
282 | static const charT l[5] = { 'l', 'o', 'w', 'e', 'r', }; | |
283 | static const charT u[5] = { 'u', 'p', 'p', 'e', 'r', }; | |
284 | static const charT a[5] = { 'a', 'l', 'p', 'h', 'a', }; | |
285 | m_word_mask = m_traits.lookup_classname(&w, &w +1); | |
286 | m_mask_space = m_traits.lookup_classname(&s, &s +1); | |
287 | m_lower_mask = m_traits.lookup_classname(l, l + 5); | |
288 | m_upper_mask = m_traits.lookup_classname(u, u + 5); | |
289 | m_alpha_mask = m_traits.lookup_classname(a, a + 5); | |
290 | m_pdata->m_word_mask = m_word_mask; | |
1e59de90 TL |
291 | BOOST_REGEX_ASSERT(m_word_mask != 0); |
292 | BOOST_REGEX_ASSERT(m_mask_space != 0); | |
293 | BOOST_REGEX_ASSERT(m_lower_mask != 0); | |
294 | BOOST_REGEX_ASSERT(m_upper_mask != 0); | |
295 | BOOST_REGEX_ASSERT(m_alpha_mask != 0); | |
7c673cae FG |
296 | } |
297 | ||
298 | template <class charT, class traits> | |
299 | re_syntax_base* basic_regex_creator<charT, traits>::append_state(syntax_element_type t, std::size_t s) | |
300 | { | |
301 | // if the state is a backref then make a note of it: | |
302 | if(t == syntax_element_backref) | |
303 | this->m_has_backrefs = true; | |
304 | // append a new state, start by aligning our last one: | |
305 | m_pdata->m_data.align(); | |
306 | // set the offset to the next state in our last one: | |
307 | if(m_last_state) | |
308 | m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state); | |
1e59de90 | 309 | // now actually extend our data: |
7c673cae FG |
310 | m_last_state = static_cast<re_syntax_base*>(m_pdata->m_data.extend(s)); |
311 | // fill in boilerplate options in the new state: | |
312 | m_last_state->next.i = 0; | |
313 | m_last_state->type = t; | |
314 | return m_last_state; | |
315 | } | |
316 | ||
317 | template <class charT, class traits> | |
318 | re_syntax_base* basic_regex_creator<charT, traits>::insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s) | |
319 | { | |
320 | // append a new state, start by aligning our last one: | |
321 | m_pdata->m_data.align(); | |
322 | // set the offset to the next state in our last one: | |
323 | if(m_last_state) | |
324 | m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state); | |
325 | // remember the last state position: | |
326 | std::ptrdiff_t off = getoffset(m_last_state) + s; | |
327 | // now actually insert our data: | |
328 | re_syntax_base* new_state = static_cast<re_syntax_base*>(m_pdata->m_data.insert(pos, s)); | |
329 | // fill in boilerplate options in the new state: | |
330 | new_state->next.i = s; | |
331 | new_state->type = t; | |
332 | m_last_state = getaddress(off); | |
333 | return new_state; | |
334 | } | |
335 | ||
336 | template <class charT, class traits> | |
337 | re_literal* basic_regex_creator<charT, traits>::append_literal(charT c) | |
338 | { | |
339 | re_literal* result; | |
340 | // start by seeing if we have an existing re_literal we can extend: | |
341 | if((0 == m_last_state) || (m_last_state->type != syntax_element_literal)) | |
342 | { | |
343 | // no existing re_literal, create a new one: | |
344 | result = static_cast<re_literal*>(append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT))); | |
345 | result->length = 1; | |
346 | *static_cast<charT*>(static_cast<void*>(result+1)) = m_traits.translate(c, m_icase); | |
347 | } | |
348 | else | |
349 | { | |
350 | // we have an existing re_literal, extend it: | |
351 | std::ptrdiff_t off = getoffset(m_last_state); | |
352 | m_pdata->m_data.extend(sizeof(charT)); | |
353 | m_last_state = result = static_cast<re_literal*>(getaddress(off)); | |
354 | charT* characters = static_cast<charT*>(static_cast<void*>(result+1)); | |
355 | characters[result->length] = m_traits.translate(c, m_icase); | |
356 | result->length += 1; | |
357 | } | |
358 | return result; | |
359 | } | |
360 | ||
361 | template <class charT, class traits> | |
362 | inline re_syntax_base* basic_regex_creator<charT, traits>::append_set( | |
363 | const basic_char_set<charT, traits>& char_set) | |
364 | { | |
365 | typedef mpl::bool_< (sizeof(charT) == 1) > truth_type; | |
366 | return char_set.has_digraphs() | |
367 | ? append_set(char_set, static_cast<mpl::false_*>(0)) | |
368 | : append_set(char_set, static_cast<truth_type*>(0)); | |
369 | } | |
370 | ||
371 | template <class charT, class traits> | |
372 | re_syntax_base* basic_regex_creator<charT, traits>::append_set( | |
373 | const basic_char_set<charT, traits>& char_set, mpl::false_*) | |
374 | { | |
375 | typedef typename traits::string_type string_type; | |
376 | typedef typename basic_char_set<charT, traits>::list_iterator item_iterator; | |
b32b8144 | 377 | typedef typename basic_char_set<charT, traits>::set_iterator set_iterator; |
7c673cae FG |
378 | typedef typename traits::char_class_type m_type; |
379 | ||
380 | re_set_long<m_type>* result = static_cast<re_set_long<m_type>*>(append_state(syntax_element_long_set, sizeof(re_set_long<m_type>))); | |
381 | // | |
382 | // fill in the basics: | |
383 | // | |
384 | result->csingles = static_cast<unsigned int>(::boost::BOOST_REGEX_DETAIL_NS::distance(char_set.singles_begin(), char_set.singles_end())); | |
385 | result->cranges = static_cast<unsigned int>(::boost::BOOST_REGEX_DETAIL_NS::distance(char_set.ranges_begin(), char_set.ranges_end())) / 2; | |
386 | result->cequivalents = static_cast<unsigned int>(::boost::BOOST_REGEX_DETAIL_NS::distance(char_set.equivalents_begin(), char_set.equivalents_end())); | |
387 | result->cclasses = char_set.classes(); | |
388 | result->cnclasses = char_set.negated_classes(); | |
389 | if(flags() & regbase::icase) | |
390 | { | |
391 | // adjust classes as needed: | |
392 | if(((result->cclasses & m_lower_mask) == m_lower_mask) || ((result->cclasses & m_upper_mask) == m_upper_mask)) | |
393 | result->cclasses |= m_alpha_mask; | |
394 | if(((result->cnclasses & m_lower_mask) == m_lower_mask) || ((result->cnclasses & m_upper_mask) == m_upper_mask)) | |
395 | result->cnclasses |= m_alpha_mask; | |
396 | } | |
397 | ||
398 | result->isnot = char_set.is_negated(); | |
399 | result->singleton = !char_set.has_digraphs(); | |
400 | // | |
401 | // remember where the state is for later: | |
402 | // | |
403 | std::ptrdiff_t offset = getoffset(result); | |
404 | // | |
405 | // now extend with all the singles: | |
406 | // | |
407 | item_iterator first, last; | |
b32b8144 FG |
408 | set_iterator sfirst, slast; |
409 | sfirst = char_set.singles_begin(); | |
410 | slast = char_set.singles_end(); | |
411 | while(sfirst != slast) | |
7c673cae | 412 | { |
b32b8144 FG |
413 | charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (sfirst->first == static_cast<charT>(0) ? 1 : sfirst->second ? 3 : 2))); |
414 | p[0] = m_traits.translate(sfirst->first, m_icase); | |
415 | if(sfirst->first == static_cast<charT>(0)) | |
416 | { | |
417 | p[0] = 0; | |
418 | } | |
419 | else if(sfirst->second) | |
7c673cae | 420 | { |
b32b8144 | 421 | p[1] = m_traits.translate(sfirst->second, m_icase); |
7c673cae FG |
422 | p[2] = 0; |
423 | } | |
424 | else | |
425 | p[1] = 0; | |
b32b8144 | 426 | ++sfirst; |
7c673cae FG |
427 | } |
428 | // | |
429 | // now extend with all the ranges: | |
430 | // | |
431 | first = char_set.ranges_begin(); | |
432 | last = char_set.ranges_end(); | |
433 | while(first != last) | |
434 | { | |
435 | // first grab the endpoints of the range: | |
436 | digraph<charT> c1 = *first; | |
437 | c1.first = this->m_traits.translate(c1.first, this->m_icase); | |
438 | c1.second = this->m_traits.translate(c1.second, this->m_icase); | |
439 | ++first; | |
440 | digraph<charT> c2 = *first; | |
441 | c2.first = this->m_traits.translate(c2.first, this->m_icase); | |
442 | c2.second = this->m_traits.translate(c2.second, this->m_icase); | |
443 | ++first; | |
444 | string_type s1, s2; | |
445 | // different actions now depending upon whether collation is turned on: | |
446 | if(flags() & regex_constants::collate) | |
447 | { | |
448 | // we need to transform our range into sort keys: | |
449 | charT a1[3] = { c1.first, c1.second, charT(0), }; | |
450 | charT a2[3] = { c2.first, c2.second, charT(0), }; | |
451 | s1 = this->m_traits.transform(a1, (a1[1] ? a1+2 : a1+1)); | |
452 | s2 = this->m_traits.transform(a2, (a2[1] ? a2+2 : a2+1)); | |
1e59de90 | 453 | if(s1.empty()) |
7c673cae | 454 | s1 = string_type(1, charT(0)); |
1e59de90 | 455 | if(s2.empty()) |
7c673cae FG |
456 | s2 = string_type(1, charT(0)); |
457 | } | |
458 | else | |
459 | { | |
460 | if(c1.second) | |
461 | { | |
462 | s1.insert(s1.end(), c1.first); | |
463 | s1.insert(s1.end(), c1.second); | |
464 | } | |
465 | else | |
466 | s1 = string_type(1, c1.first); | |
467 | if(c2.second) | |
468 | { | |
469 | s2.insert(s2.end(), c2.first); | |
470 | s2.insert(s2.end(), c2.second); | |
471 | } | |
472 | else | |
473 | s2.insert(s2.end(), c2.first); | |
474 | } | |
475 | if(s1 > s2) | |
476 | { | |
477 | // Oops error: | |
478 | return 0; | |
479 | } | |
480 | charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s1.size() + s2.size() + 2) ) ); | |
481 | BOOST_REGEX_DETAIL_NS::copy(s1.begin(), s1.end(), p); | |
482 | p[s1.size()] = charT(0); | |
483 | p += s1.size() + 1; | |
484 | BOOST_REGEX_DETAIL_NS::copy(s2.begin(), s2.end(), p); | |
485 | p[s2.size()] = charT(0); | |
486 | } | |
487 | // | |
488 | // now process the equivalence classes: | |
489 | // | |
b32b8144 FG |
490 | sfirst = char_set.equivalents_begin(); |
491 | slast = char_set.equivalents_end(); | |
492 | while(sfirst != slast) | |
7c673cae FG |
493 | { |
494 | string_type s; | |
b32b8144 | 495 | if(sfirst->second) |
7c673cae | 496 | { |
b32b8144 | 497 | charT cs[3] = { sfirst->first, sfirst->second, charT(0), }; |
7c673cae FG |
498 | s = m_traits.transform_primary(cs, cs+2); |
499 | } | |
500 | else | |
b32b8144 | 501 | s = m_traits.transform_primary(&sfirst->first, &sfirst->first+1); |
7c673cae FG |
502 | if(s.empty()) |
503 | return 0; // invalid or unsupported equivalence class | |
504 | charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s.size()+1) ) ); | |
505 | BOOST_REGEX_DETAIL_NS::copy(s.begin(), s.end(), p); | |
506 | p[s.size()] = charT(0); | |
b32b8144 | 507 | ++sfirst; |
7c673cae FG |
508 | } |
509 | // | |
510 | // finally reset the address of our last state: | |
511 | // | |
512 | m_last_state = result = static_cast<re_set_long<m_type>*>(getaddress(offset)); | |
513 | return result; | |
514 | } | |
515 | ||
//
// char_less: "less than" for character types.  The generic version
// uses operator< directly; the overloads for char and signed char
// compare the values as unsigned char.
//
template<class T>
inline bool char_less(T t1, T t2)
{
   return t1 < t2;
}
inline bool char_less(char t1, char t2)
{
   const unsigned char lhs = static_cast<unsigned char>(t1);
   const unsigned char rhs = static_cast<unsigned char>(t2);
   return lhs < rhs;
}
inline bool char_less(signed char t1, signed char t2)
{
   const unsigned char lhs = static_cast<unsigned char>(t1);
   const unsigned char rhs = static_cast<unsigned char>(t2);
   return lhs < rhs;
}
529 | ||
530 | template <class charT, class traits> | |
531 | re_syntax_base* basic_regex_creator<charT, traits>::append_set( | |
532 | const basic_char_set<charT, traits>& char_set, mpl::true_*) | |
533 | { | |
534 | typedef typename traits::string_type string_type; | |
535 | typedef typename basic_char_set<charT, traits>::list_iterator item_iterator; | |
b32b8144 FG |
536 | typedef typename basic_char_set<charT, traits>::set_iterator set_iterator; |
537 | ||
7c673cae FG |
538 | re_set* result = static_cast<re_set*>(append_state(syntax_element_set, sizeof(re_set))); |
539 | bool negate = char_set.is_negated(); | |
540 | std::memset(result->_map, 0, sizeof(result->_map)); | |
541 | // | |
542 | // handle singles first: | |
543 | // | |
544 | item_iterator first, last; | |
b32b8144 FG |
545 | set_iterator sfirst, slast; |
546 | sfirst = char_set.singles_begin(); | |
547 | slast = char_set.singles_end(); | |
548 | while(sfirst != slast) | |
7c673cae FG |
549 | { |
550 | for(unsigned int i = 0; i < (1 << CHAR_BIT); ++i) | |
551 | { | |
552 | if(this->m_traits.translate(static_cast<charT>(i), this->m_icase) | |
b32b8144 | 553 | == this->m_traits.translate(sfirst->first, this->m_icase)) |
7c673cae FG |
554 | result->_map[i] = true; |
555 | } | |
b32b8144 | 556 | ++sfirst; |
7c673cae FG |
557 | } |
558 | // | |
559 | // OK now handle ranges: | |
560 | // | |
561 | first = char_set.ranges_begin(); | |
562 | last = char_set.ranges_end(); | |
563 | while(first != last) | |
564 | { | |
565 | // first grab the endpoints of the range: | |
566 | charT c1 = this->m_traits.translate(first->first, this->m_icase); | |
567 | ++first; | |
568 | charT c2 = this->m_traits.translate(first->first, this->m_icase); | |
569 | ++first; | |
570 | // different actions now depending upon whether collation is turned on: | |
571 | if(flags() & regex_constants::collate) | |
572 | { | |
573 | // we need to transform our range into sort keys: | |
574 | charT c3[2] = { c1, charT(0), }; | |
575 | string_type s1 = this->m_traits.transform(c3, c3+1); | |
576 | c3[0] = c2; | |
577 | string_type s2 = this->m_traits.transform(c3, c3+1); | |
578 | if(s1 > s2) | |
579 | { | |
580 | // Oops error: | |
581 | return 0; | |
582 | } | |
1e59de90 | 583 | BOOST_REGEX_ASSERT(c3[1] == charT(0)); |
7c673cae FG |
584 | for(unsigned i = 0; i < (1u << CHAR_BIT); ++i) |
585 | { | |
586 | c3[0] = static_cast<charT>(i); | |
587 | string_type s3 = this->m_traits.transform(c3, c3 +1); | |
588 | if((s1 <= s3) && (s3 <= s2)) | |
589 | result->_map[i] = true; | |
590 | } | |
591 | } | |
592 | else | |
593 | { | |
594 | if(char_less(c2, c1)) | |
595 | { | |
596 | // Oops error: | |
597 | return 0; | |
598 | } | |
599 | // everything in range matches: | |
20effc67 | 600 | std::memset(result->_map + static_cast<unsigned char>(c1), true, static_cast<unsigned char>(1u) + static_cast<unsigned char>(static_cast<unsigned char>(c2) - static_cast<unsigned char>(c1))); |
7c673cae FG |
601 | } |
602 | } | |
603 | // | |
604 | // and now the classes: | |
605 | // | |
606 | typedef typename traits::char_class_type m_type; | |
607 | m_type m = char_set.classes(); | |
608 | if(flags() & regbase::icase) | |
609 | { | |
610 | // adjust m as needed: | |
611 | if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask)) | |
612 | m |= m_alpha_mask; | |
613 | } | |
614 | if(m != 0) | |
615 | { | |
616 | for(unsigned i = 0; i < (1u << CHAR_BIT); ++i) | |
617 | { | |
618 | if(this->m_traits.isctype(static_cast<charT>(i), m)) | |
619 | result->_map[i] = true; | |
620 | } | |
621 | } | |
622 | // | |
623 | // and now the negated classes: | |
624 | // | |
625 | m = char_set.negated_classes(); | |
626 | if(flags() & regbase::icase) | |
627 | { | |
628 | // adjust m as needed: | |
629 | if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask)) | |
630 | m |= m_alpha_mask; | |
631 | } | |
632 | if(m != 0) | |
633 | { | |
634 | for(unsigned i = 0; i < (1u << CHAR_BIT); ++i) | |
635 | { | |
636 | if(0 == this->m_traits.isctype(static_cast<charT>(i), m)) | |
637 | result->_map[i] = true; | |
638 | } | |
639 | } | |
640 | // | |
641 | // now process the equivalence classes: | |
642 | // | |
b32b8144 FG |
643 | sfirst = char_set.equivalents_begin(); |
644 | slast = char_set.equivalents_end(); | |
645 | while(sfirst != slast) | |
7c673cae FG |
646 | { |
647 | string_type s; | |
1e59de90 | 648 | BOOST_REGEX_ASSERT(static_cast<charT>(0) == sfirst->second); |
b32b8144 | 649 | s = m_traits.transform_primary(&sfirst->first, &sfirst->first+1); |
7c673cae FG |
650 | if(s.empty()) |
651 | return 0; // invalid or unsupported equivalence class | |
652 | for(unsigned i = 0; i < (1u << CHAR_BIT); ++i) | |
653 | { | |
654 | charT c[2] = { (static_cast<charT>(i)), charT(0), }; | |
655 | string_type s2 = this->m_traits.transform_primary(c, c+1); | |
656 | if(s == s2) | |
657 | result->_map[i] = true; | |
658 | } | |
b32b8144 | 659 | ++sfirst; |
7c673cae FG |
660 | } |
661 | if(negate) | |
662 | { | |
663 | for(unsigned i = 0; i < (1u << CHAR_BIT); ++i) | |
664 | { | |
665 | result->_map[i] = !(result->_map[i]); | |
666 | } | |
667 | } | |
668 | return result; | |
669 | } | |
670 | ||
671 | template <class charT, class traits> | |
672 | void basic_regex_creator<charT, traits>::finalize(const charT* p1, const charT* p2) | |
673 | { | |
674 | if(this->m_pdata->m_status) | |
675 | return; | |
676 | // we've added all the states we need, now finish things off. | |
677 | // start by adding a terminating state: | |
678 | append_state(syntax_element_match); | |
679 | // extend storage to store original expression: | |
680 | std::ptrdiff_t len = p2 - p1; | |
681 | m_pdata->m_expression_len = len; | |
682 | charT* ps = static_cast<charT*>(m_pdata->m_data.extend(sizeof(charT) * (1 + (p2 - p1)))); | |
683 | m_pdata->m_expression = ps; | |
684 | BOOST_REGEX_DETAIL_NS::copy(p1, p2, ps); | |
685 | ps[p2 - p1] = 0; | |
686 | // fill in our other data... | |
687 | // successful parsing implies a zero status: | |
688 | m_pdata->m_status = 0; | |
689 | // get the first state of the machine: | |
690 | m_pdata->m_first_state = static_cast<re_syntax_base*>(m_pdata->m_data.data()); | |
691 | // fixup pointers in the machine: | |
692 | fixup_pointers(m_pdata->m_first_state); | |
693 | if(m_has_recursions) | |
694 | { | |
695 | m_pdata->m_has_recursions = true; | |
696 | fixup_recursions(m_pdata->m_first_state); | |
697 | if(this->m_pdata->m_status) | |
698 | return; | |
699 | } | |
700 | else | |
701 | m_pdata->m_has_recursions = false; | |
702 | // create nested startmaps: | |
703 | create_startmaps(m_pdata->m_first_state); | |
704 | // create main startmap: | |
705 | std::memset(m_pdata->m_startmap, 0, sizeof(m_pdata->m_startmap)); | |
706 | m_pdata->m_can_be_null = 0; | |
707 | ||
708 | m_bad_repeats = 0; | |
709 | if(m_has_recursions) | |
b32b8144 | 710 | m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u); |
7c673cae FG |
711 | create_startmap(m_pdata->m_first_state, m_pdata->m_startmap, &(m_pdata->m_can_be_null), mask_all); |
712 | // get the restart type: | |
713 | m_pdata->m_restart_type = get_restart_type(m_pdata->m_first_state); | |
714 | // optimise a leading repeat if there is one: | |
715 | probe_leading_repeat(m_pdata->m_first_state); | |
716 | } | |
717 | ||
718 | template <class charT, class traits> | |
719 | void basic_regex_creator<charT, traits>::fixup_pointers(re_syntax_base* state) | |
720 | { | |
721 | while(state) | |
722 | { | |
723 | switch(state->type) | |
724 | { | |
725 | case syntax_element_recurse: | |
726 | m_has_recursions = true; | |
727 | if(state->next.i) | |
728 | state->next.p = getaddress(state->next.i, state); | |
729 | else | |
730 | state->next.p = 0; | |
731 | break; | |
732 | case syntax_element_rep: | |
733 | case syntax_element_dot_rep: | |
734 | case syntax_element_char_rep: | |
735 | case syntax_element_short_set_rep: | |
736 | case syntax_element_long_set_rep: | |
737 | // set the state_id of this repeat: | |
738 | static_cast<re_repeat*>(state)->state_id = m_repeater_id++; | |
739 | BOOST_FALLTHROUGH; | |
740 | case syntax_element_alt: | |
741 | std::memset(static_cast<re_alt*>(state)->_map, 0, sizeof(static_cast<re_alt*>(state)->_map)); | |
742 | static_cast<re_alt*>(state)->can_be_null = 0; | |
743 | BOOST_FALLTHROUGH; | |
744 | case syntax_element_jump: | |
745 | static_cast<re_jump*>(state)->alt.p = getaddress(static_cast<re_jump*>(state)->alt.i, state); | |
746 | BOOST_FALLTHROUGH; | |
747 | default: | |
748 | if(state->next.i) | |
749 | state->next.p = getaddress(state->next.i, state); | |
750 | else | |
751 | state->next.p = 0; | |
752 | } | |
753 | state = state->next.p; | |
754 | } | |
755 | } | |
756 | ||
757 | template <class charT, class traits> | |
758 | void basic_regex_creator<charT, traits>::fixup_recursions(re_syntax_base* state) | |
759 | { | |
760 | re_syntax_base* base = state; | |
761 | while(state) | |
762 | { | |
763 | switch(state->type) | |
764 | { | |
765 | case syntax_element_assert_backref: | |
766 | { | |
767 | // just check that the index is valid: | |
768 | int idx = static_cast<const re_brace*>(state)->index; | |
769 | if(idx < 0) | |
770 | { | |
771 | idx = -idx-1; | |
20effc67 | 772 | if(idx >= hash_value_mask) |
7c673cae FG |
773 | { |
774 | idx = m_pdata->get_id(idx); | |
775 | if(idx <= 0) | |
776 | { | |
777 | // check of sub-expression that doesn't exist: | |
778 | if(0 == this->m_pdata->m_status) // update the error code if not already set | |
779 | this->m_pdata->m_status = boost::regex_constants::error_bad_pattern; | |
780 | // | |
781 | // clear the expression, we should be empty: | |
782 | // | |
783 | this->m_pdata->m_expression = 0; | |
784 | this->m_pdata->m_expression_len = 0; | |
785 | // | |
786 | // and throw if required: | |
787 | // | |
788 | if(0 == (this->flags() & regex_constants::no_except)) | |
789 | { | |
790 | std::string message = "Encountered a forward reference to a marked sub-expression that does not exist."; | |
791 | boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0); | |
792 | e.raise(); | |
793 | } | |
794 | } | |
795 | } | |
796 | } | |
797 | } | |
798 | break; | |
799 | case syntax_element_recurse: | |
800 | { | |
801 | bool ok = false; | |
802 | re_syntax_base* p = base; | |
803 | std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i; | |
20effc67 | 804 | if(idx >= hash_value_mask) |
7c673cae FG |
805 | { |
806 | // | |
807 | // There may be more than one capture group with this hash, just do what Perl | |
808 | // does and recurse to the leftmost: | |
809 | // | |
810 | idx = m_pdata->get_id(static_cast<int>(idx)); | |
811 | } | |
b32b8144 FG |
812 | if(idx < 0) |
813 | { | |
814 | ok = false; | |
815 | } | |
816 | else | |
7c673cae | 817 | { |
b32b8144 | 818 | while(p) |
7c673cae | 819 | { |
b32b8144 | 820 | if((p->type == syntax_element_startmark) && (static_cast<re_brace*>(p)->index == idx)) |
7c673cae | 821 | { |
b32b8144 FG |
822 | // |
823 | // We've found the target of the recursion, set the jump target: | |
824 | // | |
825 | static_cast<re_jump*>(state)->alt.p = p; | |
826 | ok = true; | |
827 | // | |
828 | // Now scan the target for nested repeats: | |
829 | // | |
830 | p = p->next.p; | |
831 | int next_rep_id = 0; | |
832 | while(p) | |
7c673cae | 833 | { |
b32b8144 FG |
834 | switch(p->type) |
835 | { | |
836 | case syntax_element_rep: | |
837 | case syntax_element_dot_rep: | |
838 | case syntax_element_char_rep: | |
839 | case syntax_element_short_set_rep: | |
840 | case syntax_element_long_set_rep: | |
841 | next_rep_id = static_cast<re_repeat*>(p)->state_id; | |
842 | break; | |
843 | case syntax_element_endmark: | |
844 | if(static_cast<const re_brace*>(p)->index == idx) | |
845 | next_rep_id = -1; | |
846 | break; | |
847 | default: | |
848 | break; | |
849 | } | |
850 | if(next_rep_id) | |
851 | break; | |
852 | p = p->next.p; | |
853 | } | |
854 | if(next_rep_id > 0) | |
855 | { | |
856 | static_cast<re_recurse*>(state)->state_id = next_rep_id - 1; | |
7c673cae | 857 | } |
7c673cae | 858 | |
b32b8144 FG |
859 | break; |
860 | } | |
861 | p = p->next.p; | |
7c673cae | 862 | } |
7c673cae FG |
863 | } |
864 | if(!ok) | |
865 | { | |
866 | // recursion to sub-expression that doesn't exist: | |
867 | if(0 == this->m_pdata->m_status) // update the error code if not already set | |
868 | this->m_pdata->m_status = boost::regex_constants::error_bad_pattern; | |
869 | // | |
870 | // clear the expression, we should be empty: | |
871 | // | |
872 | this->m_pdata->m_expression = 0; | |
873 | this->m_pdata->m_expression_len = 0; | |
874 | // | |
875 | // and throw if required: | |
876 | // | |
877 | if(0 == (this->flags() & regex_constants::no_except)) | |
878 | { | |
879 | std::string message = "Encountered a forward reference to a recursive sub-expression that does not exist."; | |
880 | boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0); | |
881 | e.raise(); | |
882 | } | |
883 | } | |
884 | } | |
885 | break; | |
886 | default: | |
887 | break; | |
888 | } | |
889 | state = state->next.p; | |
890 | } | |
891 | } | |
892 | ||
893 | template <class charT, class traits> | |
894 | void basic_regex_creator<charT, traits>::create_startmaps(re_syntax_base* state) | |
895 | { | |
896 | // non-recursive implementation: | |
897 | // create the last map in the machine first, so that earlier maps | |
898 | // can make use of the result... | |
899 | // | |
900 | // This was originally a recursive implementation, but that caused stack | |
901 | // overflows with complex expressions on small stacks (think COM+). | |
902 | ||
903 | // start by saving the case setting: | |
904 | bool l_icase = m_icase; | |
905 | std::vector<std::pair<bool, re_syntax_base*> > v; | |
906 | ||
907 | while(state) | |
908 | { | |
909 | switch(state->type) | |
910 | { | |
911 | case syntax_element_toggle_case: | |
912 | // we need to track case changes here: | |
913 | m_icase = static_cast<re_case*>(state)->icase; | |
914 | state = state->next.p; | |
915 | continue; | |
916 | case syntax_element_alt: | |
917 | case syntax_element_rep: | |
918 | case syntax_element_dot_rep: | |
919 | case syntax_element_char_rep: | |
920 | case syntax_element_short_set_rep: | |
921 | case syntax_element_long_set_rep: | |
922 | // just push the state onto our stack for now: | |
923 | v.push_back(std::pair<bool, re_syntax_base*>(m_icase, state)); | |
924 | state = state->next.p; | |
925 | break; | |
926 | case syntax_element_backstep: | |
927 | // we need to calculate how big the backstep is: | |
928 | static_cast<re_brace*>(state)->index | |
929 | = this->calculate_backstep(state->next.p); | |
930 | if(static_cast<re_brace*>(state)->index < 0) | |
931 | { | |
932 | // Oops error: | |
933 | if(0 == this->m_pdata->m_status) // update the error code if not already set | |
934 | this->m_pdata->m_status = boost::regex_constants::error_bad_pattern; | |
935 | // | |
936 | // clear the expression, we should be empty: | |
937 | // | |
938 | this->m_pdata->m_expression = 0; | |
939 | this->m_pdata->m_expression_len = 0; | |
940 | // | |
941 | // and throw if required: | |
942 | // | |
943 | if(0 == (this->flags() & regex_constants::no_except)) | |
944 | { | |
945 | std::string message = "Invalid lookbehind assertion encountered in the regular expression."; | |
946 | boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0); | |
947 | e.raise(); | |
948 | } | |
949 | } | |
950 | BOOST_FALLTHROUGH; | |
951 | default: | |
952 | state = state->next.p; | |
953 | } | |
954 | } | |
955 | ||
956 | // now work through our list, building all the maps as we go: | |
1e59de90 | 957 | while(!v.empty()) |
7c673cae FG |
958 | { |
959 | // Initialize m_recursion_checks if we need it: | |
960 | if(m_has_recursions) | |
b32b8144 | 961 | m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u); |
7c673cae FG |
962 | |
963 | const std::pair<bool, re_syntax_base*>& p = v.back(); | |
964 | m_icase = p.first; | |
965 | state = p.second; | |
966 | v.pop_back(); | |
967 | ||
968 | // Build maps: | |
969 | m_bad_repeats = 0; | |
970 | create_startmap(state->next.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_take); | |
971 | m_bad_repeats = 0; | |
972 | ||
973 | if(m_has_recursions) | |
b32b8144 | 974 | m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u); |
7c673cae FG |
975 | create_startmap(static_cast<re_alt*>(state)->alt.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_skip); |
976 | // adjust the type of the state to allow for faster matching: | |
977 | state->type = this->get_repeat_type(state); | |
978 | } | |
979 | // restore case sensitivity: | |
980 | m_icase = l_icase; | |
981 | } | |
982 | ||
983 | template <class charT, class traits> | |
984 | int basic_regex_creator<charT, traits>::calculate_backstep(re_syntax_base* state) | |
985 | { | |
986 | typedef typename traits::char_class_type m_type; | |
987 | int result = 0; | |
988 | while(state) | |
989 | { | |
990 | switch(state->type) | |
991 | { | |
992 | case syntax_element_startmark: | |
993 | if((static_cast<re_brace*>(state)->index == -1) | |
994 | || (static_cast<re_brace*>(state)->index == -2)) | |
995 | { | |
996 | state = static_cast<re_jump*>(state->next.p)->alt.p->next.p; | |
997 | continue; | |
998 | } | |
999 | else if(static_cast<re_brace*>(state)->index == -3) | |
1000 | { | |
1001 | state = state->next.p->next.p; | |
1002 | continue; | |
1003 | } | |
1004 | break; | |
1005 | case syntax_element_endmark: | |
1006 | if((static_cast<re_brace*>(state)->index == -1) | |
1007 | || (static_cast<re_brace*>(state)->index == -2)) | |
1008 | return result; | |
1009 | break; | |
1010 | case syntax_element_literal: | |
1011 | result += static_cast<re_literal*>(state)->length; | |
1012 | break; | |
1013 | case syntax_element_wild: | |
1014 | case syntax_element_set: | |
1015 | result += 1; | |
1016 | break; | |
1017 | case syntax_element_dot_rep: | |
1018 | case syntax_element_char_rep: | |
1019 | case syntax_element_short_set_rep: | |
1020 | case syntax_element_backref: | |
1021 | case syntax_element_rep: | |
1022 | case syntax_element_combining: | |
1023 | case syntax_element_long_set_rep: | |
1024 | case syntax_element_backstep: | |
1025 | { | |
1026 | re_repeat* rep = static_cast<re_repeat *>(state); | |
1027 | // adjust the type of the state to allow for faster matching: | |
1028 | state->type = this->get_repeat_type(state); | |
1029 | if((state->type == syntax_element_dot_rep) | |
1030 | || (state->type == syntax_element_char_rep) | |
1031 | || (state->type == syntax_element_short_set_rep)) | |
1032 | { | |
1033 | if(rep->max != rep->min) | |
1034 | return -1; | |
1035 | result += static_cast<int>(rep->min); | |
1036 | state = rep->alt.p; | |
1037 | continue; | |
1038 | } | |
1039 | else if(state->type == syntax_element_long_set_rep) | |
1040 | { | |
1e59de90 | 1041 | BOOST_REGEX_ASSERT(rep->next.p->type == syntax_element_long_set); |
7c673cae FG |
1042 | if(static_cast<re_set_long<m_type>*>(rep->next.p)->singleton == 0) |
1043 | return -1; | |
1044 | if(rep->max != rep->min) | |
1045 | return -1; | |
1046 | result += static_cast<int>(rep->min); | |
1047 | state = rep->alt.p; | |
1048 | continue; | |
1049 | } | |
1050 | } | |
1051 | return -1; | |
1052 | case syntax_element_long_set: | |
1053 | if(static_cast<re_set_long<m_type>*>(state)->singleton == 0) | |
1054 | return -1; | |
1055 | result += 1; | |
1056 | break; | |
1057 | case syntax_element_jump: | |
1058 | state = static_cast<re_jump*>(state)->alt.p; | |
1059 | continue; | |
1060 | case syntax_element_alt: | |
1061 | { | |
1062 | int r1 = calculate_backstep(state->next.p); | |
1063 | int r2 = calculate_backstep(static_cast<re_alt*>(state)->alt.p); | |
1064 | if((r1 < 0) || (r1 != r2)) | |
1065 | return -1; | |
1066 | return result + r1; | |
1067 | } | |
1068 | default: | |
1069 | break; | |
1070 | } | |
1071 | state = state->next.p; | |
1072 | } | |
1073 | return -1; | |
1074 | } | |
1075 | ||
20effc67 TL |
//
// recursion_saver:
// Scope guard for a vector of recursion flags: takes a copy of *p on
// construction and puts that copy back into *p on destruction, so any
// changes made to the vector during the guard's lifetime are discarded.
//
struct recursion_saver
{
   std::vector<unsigned char> saved_state;   // snapshot taken at construction
   std::vector<unsigned char>* state;        // the vector being guarded

   recursion_saver(std::vector<unsigned char>* p)
      : saved_state(p->begin(), p->end()), state(p)
   {
   }

   ~recursion_saver()
   {
      // exchange the snapshot with the live contents:
      saved_state.swap(*state);
   }
};
1086 | ||
7c673cae FG |
//
// create_startmap:
// Follows the states starting at "state" and records which characters could
// begin a match from that point.
//
//   state - first state to examine.
//   l_map - table indexed by character value (1u << CHAR_BIT entries are
//           written); "mask" is OR'ed into each entry whose character could
//           start a match.  May be null, in which case only *pnull is updated.
//           Element 0 additionally receives mask_init once the table has
//           been written to.
//   pnull - "mask" is OR'ed into *pnull if an empty match is possible.
//           May be null.
//   mask  - the bit(s) to set (callers in this file pass mask_take or mask_skip).
//
// m_recursion_checks is snapshotted on entry and restored on exit by the
// recursion_saver local.  On detecting an infinite recursion the status is set
// to error_bad_pattern, the expression is cleared, and (unless no_except is
// set) a boost::regex_error is raised.
//
1087 | template <class charT, class traits> | |
1088 | void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask) | |
1089 | { | |
20effc67 | 1090 | recursion_saver saved_recursions(&m_recursion_checks);
7c673cae FG |
// not_last_jump is set to -1 when a jump is taken and incremented at the
// bottom of every loop iteration, so it is 0 only while examining the state
// reached directly by a jump:
1091 | int not_last_jump = 1; | |
// Bookkeeping for a recursion currently being followed: the recurse state,
// the index of the sub-expression entered, and where to resume afterwards:
1092 | re_syntax_base* recursion_start = 0; | |
1093 | int recursion_sub = 0; | |
1094 | re_syntax_base* recursion_restart = 0; | |
1095 | ||
1096 | // track case sensitivity: | |
1097 | bool l_icase = m_icase; | |
1098 | ||
1099 | while(state) | |
1100 | { | |
1101 | switch(state->type) | |
1102 | { | |
1103 | case syntax_element_toggle_case: | |
1104 | l_icase = static_cast<re_case*>(state)->icase; | |
1105 | state = state->next.p; | |
1106 | break; | |
1107 | case syntax_element_literal: | |
1108 | { | |
1109 | // don't set anything in *pnull, set each element in l_map | |
1110 | // that could match the first character in the literal: | |
1111 | if(l_map) | |
1112 | { | |
1113 | l_map[0] |= mask_init; | |
// the literal's characters are read from the memory immediately after the re_literal header:
1114 | charT first_char = *static_cast<charT*>(static_cast<void*>(static_cast<re_literal*>(state) + 1)); | |
1115 | for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) | |
1116 | { | |
1117 | if(m_traits.translate(static_cast<charT>(i), l_icase) == first_char) | |
1118 | l_map[i] |= mask; | |
1119 | } | |
1120 | } | |
1121 | return; | |
1122 | } | |
1123 | case syntax_element_end_line: | |
1124 | { | |
1125 | // next character must be a line separator (if there is one): | |
1126 | if(l_map) | |
1127 | { | |
1128 | l_map[0] |= mask_init; | |
1129 | l_map[static_cast<unsigned>('\n')] |= mask; | |
1130 | l_map[static_cast<unsigned>('\r')] |= mask; | |
1131 | l_map[static_cast<unsigned>('\f')] |= mask; | |
1132 | l_map[0x85] |= mask; | |
1133 | } | |
1134 | // now figure out if we can match a NULL string at this point: | |
1135 | if(pnull) | |
1136 | create_startmap(state->next.p, 0, pnull, mask); | |
1137 | return; | |
1138 | } | |
1139 | case syntax_element_recurse: | |
1140 | { | |
1e59de90 | 1141 | BOOST_REGEX_ASSERT(static_cast<const re_jump*>(state)->alt.p->type == syntax_element_startmark);
b32b8144 FG |
1142 | recursion_sub = static_cast<re_brace*>(static_cast<const re_jump*>(state)->alt.p)->index; | |
// bit 1u of m_recursion_checks[n] records that sub-expression n has already
// been entered via a recursion during this walk:
1143 | if(m_recursion_checks[recursion_sub] & 1u) | |
7c673cae FG |
1144 | { | |
1145 | // Infinite recursion!! | |
1146 | if(0 == this->m_pdata->m_status) // update the error code if not already set | |
1147 | this->m_pdata->m_status = boost::regex_constants::error_bad_pattern; | |
1148 | // | |
1149 | // clear the expression, we should be empty: | |
1150 | // | |
1151 | this->m_pdata->m_expression = 0; | |
1152 | this->m_pdata->m_expression_len = 0; | |
1153 | // | |
1154 | // and throw if required: | |
1155 | // | |
1156 | if(0 == (this->flags() & regex_constants::no_except)) | |
1157 | { | |
1158 | std::string message = "Encountered an infinite recursion."; | |
1159 | boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0); | |
1160 | e.raise(); | |
1161 | } | |
1162 | } | |
1163 | else if(recursion_start == 0) | |
1164 | { | |
// no recursion is being followed yet: step into the target sub-expression
// and remember where to continue once its end mark is reached:
1165 | recursion_start = state; | |
1166 | recursion_restart = state->next.p; | |
1167 | state = static_cast<re_jump*>(state)->alt.p; | |
b32b8144 | 1168 | m_recursion_checks[recursion_sub] |= 1u;
7c673cae FG |
1169 | break; | |
1170 | } | |
b32b8144 | 1171 | m_recursion_checks[recursion_sub] |= 1u;
7c673cae FG |
1172 | // can't handle nested recursion here... | |
1173 | BOOST_FALLTHROUGH; | |
1174 | } | |
1175 | case syntax_element_backref: | |
1176 | // can be null, and any character can match: | |
1177 | if(pnull) | |
1178 | *pnull |= mask; | |
1179 | BOOST_FALLTHROUGH; | |
1180 | case syntax_element_wild: | |
1181 | { | |
1182 | // can't be null, any character can match: | |
1183 | set_all_masks(l_map, mask); | |
1184 | return; | |
1185 | } | |
1186 | case syntax_element_accept: | |
1187 | case syntax_element_match: | |
1188 | { | |
1189 | // must be null, any character can match: | |
1190 | set_all_masks(l_map, mask); | |
1191 | if(pnull) | |
1192 | *pnull |= mask; | |
1193 | return; | |
1194 | } | |
1195 | case syntax_element_word_start: | |
1196 | { | |
1197 | // recurse, then AND with all the word characters: | |
1198 | create_startmap(state->next.p, l_map, pnull, mask); | |
1199 | if(l_map) | |
1200 | { | |
1201 | l_map[0] |= mask_init; | |
1202 | for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) | |
1203 | { | |
// clear the mask for every character that is NOT a word character:
1204 | if(!m_traits.isctype(static_cast<charT>(i), m_word_mask)) | |
1205 | l_map[i] &= static_cast<unsigned char>(~mask); | |
1206 | } | |
1207 | } | |
1208 | return; | |
1209 | } | |
1210 | case syntax_element_word_end: | |
1211 | { | |
1212 | // recurse, then AND with all the word characters: | |
1213 | create_startmap(state->next.p, l_map, pnull, mask); | |
1214 | if(l_map) | |
1215 | { | |
1216 | l_map[0] |= mask_init; | |
1217 | for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) | |
1218 | { | |
// NOTE(review): unlike the comment above, this clears the mask for every
// character that IS a word character (the test is not negated here):
1219 | if(m_traits.isctype(static_cast<charT>(i), m_word_mask)) | |
1220 | l_map[i] &= static_cast<unsigned char>(~mask); | |
1221 | } | |
1222 | } | |
1223 | return; | |
1224 | } | |
1225 | case syntax_element_buffer_end: | |
1226 | { | |
1227 | // we *must be null* : | |
1228 | if(pnull) | |
1229 | *pnull |= mask; | |
1230 | return; | |
1231 | } | |
1232 | case syntax_element_long_set: | |
1233 | if(l_map) | |
1234 | { | |
1235 | typedef typename traits::char_class_type m_type; | |
1236 | if(static_cast<re_set_long<m_type>*>(state)->singleton) | |
1237 | { | |
1238 | l_map[0] |= mask_init; | |
1239 | for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) | |
1240 | { | |
1241 | charT c = static_cast<charT>(i); | |
// re_is_set_member returning something other than &c is treated as "c is in the set":
1242 | if(&c != re_is_set_member(&c, &c + 1, static_cast<re_set_long<m_type>*>(state), *m_pdata, l_icase)) | |
1243 | l_map[i] |= mask; | |
1244 | } | |
1245 | } | |
1246 | else | |
1247 | set_all_masks(l_map, mask); | |
1248 | } | |
1249 | return; | |
1250 | case syntax_element_set: | |
1251 | if(l_map) | |
1252 | { | |
1253 | l_map[0] |= mask_init; | |
1254 | for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) | |
1255 | { | |
1256 | if(static_cast<re_set*>(state)->_map[ | |
1257 | static_cast<unsigned char>(m_traits.translate(static_cast<charT>(i), l_icase))]) | |
1258 | l_map[i] |= mask; | |
1259 | } | |
1260 | } | |
1261 | return; | |
1262 | case syntax_element_jump: | |
1263 | // take the jump: | |
1264 | state = static_cast<re_alt*>(state)->alt.p; | |
1265 | not_last_jump = -1; | |
1266 | break; | |
1267 | case syntax_element_alt: | |
1268 | case syntax_element_rep: | |
1269 | case syntax_element_dot_rep: | |
1270 | case syntax_element_char_rep: | |
1271 | case syntax_element_short_set_rep: | |
1272 | case syntax_element_long_set_rep: | |
1273 | { | |
1274 | re_alt* rep = static_cast<re_alt*>(state); | |
// mask_init in element 0 means this state's own map has already been built:
1275 | if(rep->_map[0] & mask_init) | |
1276 | { | |
1277 | if(l_map) | |
1278 | { | |
1279 | // copy previous results: | |
1280 | l_map[0] |= mask_init; | |
1281 | for(unsigned int i = 0; i <= UCHAR_MAX; ++i) | |
1282 | { | |
1283 | if(rep->_map[i] & mask_any) | |
1284 | l_map[i] |= mask; | |
1285 | } | |
1286 | } | |
1287 | if(pnull) | |
1288 | { | |
1289 | if(rep->can_be_null & mask_any) | |
1290 | *pnull |= mask; | |
1291 | } | |
1292 | } | |
1293 | else | |
1294 | { | |
1295 | // we haven't created a startmap for this alternative yet | |
1296 | // so take the union of the two options: | |
1297 | if(is_bad_repeat(state)) | |
1298 | { | |
1299 | set_all_masks(l_map, mask); | |
1300 | if(pnull) | |
1301 | *pnull |= mask; | |
1302 | return; | |
1303 | } | |
1304 | set_bad_repeat(state); | |
1305 | create_startmap(state->next.p, l_map, pnull, mask); | |
// the alternative branch is also explored for a plain alt, for a repeat whose
// minimum count is zero, or when this state was reached directly by a jump:
1306 | if((state->type == syntax_element_alt) | |
1307 | || (static_cast<re_repeat*>(state)->min == 0) | |
1308 | || (not_last_jump == 0)) | |
1309 | create_startmap(rep->alt.p, l_map, pnull, mask); | |
1310 | } | |
1311 | } | |
1312 | return; | |
1313 | case syntax_element_soft_buffer_end: | |
1314 | // match newline or null: | |
1315 | if(l_map) | |
1316 | { | |
1317 | l_map[0] |= mask_init; | |
1318 | l_map[static_cast<unsigned>('\n')] |= mask; | |
1319 | l_map[static_cast<unsigned>('\r')] |= mask; | |
1320 | } | |
1321 | if(pnull) | |
1322 | *pnull |= mask; | |
1323 | return; | |
1324 | case syntax_element_endmark: | |
1325 | // need to handle independent subs as a special case: | |
1326 | if(static_cast<re_brace*>(state)->index < 0) | |
1327 | { | |
1328 | // can be null, any character can match: | |
1329 | set_all_masks(l_map, mask); | |
1330 | if(pnull) | |
1331 | *pnull |= mask; | |
1332 | return; | |
1333 | } | |
1334 | else if(recursion_start && (recursion_sub != 0) && (recursion_sub == static_cast<re_brace*>(state)->index)) | |
1335 | { | |
1336 | // recursion termination: | |
1337 | recursion_start = 0; | |
1338 | state = recursion_restart; | |
1339 | break; | |
1340 | } | |
1341 | ||
1342 | // | |
1343 | // Normally we just go to the next state... but if this sub-expression is | |
1344 | // the target of a recursion, then we might be ending a recursion, in which | |
1345 | // case we should check whatever follows that recursion, as well as whatever | |
1346 | // follows this state: | |
1347 | // | |
1348 | if(m_pdata->m_has_recursions && static_cast<re_brace*>(state)->index) | |
1349 | { | |
1350 | bool ok = false; | |
1351 | re_syntax_base* p = m_pdata->m_first_state; | |
// search from the first state for a recurse state that targets this sub-expression;
// on success p is left pointing at the first such state:
1352 | while(p) | |
1353 | { | |
1354 | if(p->type == syntax_element_recurse) | |
1355 | { | |
1356 | re_brace* p2 = static_cast<re_brace*>(static_cast<re_jump*>(p)->alt.p); | |
1357 | if((p2->type == syntax_element_startmark) && (p2->index == static_cast<re_brace*>(state)->index)) | |
1358 | { | |
1359 | ok = true; | |
1360 | break; | |
1361 | } | |
1362 | } | |
1363 | p = p->next.p; | |
1364 | } | |
// bit 2u of m_recursion_checks[n] records that the states following a
// recursion into sub-expression n have already been examined from here:
b32b8144 | 1365 | if(ok && ((m_recursion_checks[static_cast<re_brace*>(state)->index] & 2u) == 0))
7c673cae | 1366 | {
b32b8144 | 1367 | m_recursion_checks[static_cast<re_brace*>(state)->index] |= 2u;
7c673cae FG |
1368 | create_startmap(p->next.p, l_map, pnull, mask); | |
1369 | } | |
1370 | } | |
1371 | state = state->next.p; | |
1372 | break; | |
1373 | ||
1374 | case syntax_element_commit: | |
1375 | set_all_masks(l_map, mask); | |
1376 | // Continue scanning so we can figure out whether we can be null: | |
1377 | state = state->next.p; | |
1378 | break; | |
1379 | case syntax_element_startmark: | |
1380 | // need to handle independent subs as a special case: | |
1381 | if(static_cast<re_brace*>(state)->index == -3) | |
1382 | { | |
// skip two states forward rather than one:
1383 | state = state->next.p->next.p; | |
1384 | break; | |
1385 | } | |
1386 | BOOST_FALLTHROUGH; | |
1387 | default: | |
1388 | state = state->next.p; | |
1389 | } | |
1390 | ++not_last_jump; | |
1391 | } | |
1392 | } | |
1393 | ||
1394 | template <class charT, class traits> | |
1395 | unsigned basic_regex_creator<charT, traits>::get_restart_type(re_syntax_base* state) | |
1396 | { | |
1397 | // | |
1398 | // find out how the machine starts, so we can optimise the search: | |
1399 | // | |
1400 | while(state) | |
1401 | { | |
1402 | switch(state->type) | |
1403 | { | |
1404 | case syntax_element_startmark: | |
1405 | case syntax_element_endmark: | |
1406 | state = state->next.p; | |
1407 | continue; | |
1408 | case syntax_element_start_line: | |
1409 | return regbase::restart_line; | |
1410 | case syntax_element_word_start: | |
1411 | return regbase::restart_word; | |
1412 | case syntax_element_buffer_start: | |
1413 | return regbase::restart_buf; | |
1414 | case syntax_element_restart_continue: | |
1415 | return regbase::restart_continue; | |
1416 | default: | |
1417 | state = 0; | |
1418 | continue; | |
1419 | } | |
1420 | } | |
1421 | return regbase::restart_any; | |
1422 | } | |
1423 | ||
1424 | template <class charT, class traits> | |
1425 | void basic_regex_creator<charT, traits>::set_all_masks(unsigned char* bits, unsigned char mask) | |
1426 | { | |
1427 | // | |
1428 | // set mask in all of bits elements, | |
1429 | // if bits[0] has mask_init not set then we can | |
1430 | // optimise this to a call to memset: | |
1431 | // | |
1432 | if(bits) | |
1433 | { | |
1434 | if(bits[0] == 0) | |
1435 | (std::memset)(bits, mask, 1u << CHAR_BIT); | |
1436 | else | |
1437 | { | |
1438 | for(unsigned i = 0; i < (1u << CHAR_BIT); ++i) | |
1439 | bits[i] |= mask; | |
1440 | } | |
1441 | bits[0] |= mask_init; | |
1442 | } | |
1443 | } | |
1444 | ||
1445 | template <class charT, class traits> | |
1446 | bool basic_regex_creator<charT, traits>::is_bad_repeat(re_syntax_base* pt) | |
1447 | { | |
1448 | switch(pt->type) | |
1449 | { | |
1450 | case syntax_element_rep: | |
1451 | case syntax_element_dot_rep: | |
1452 | case syntax_element_char_rep: | |
1453 | case syntax_element_short_set_rep: | |
1454 | case syntax_element_long_set_rep: | |
1455 | { | |
1456 | unsigned state_id = static_cast<re_repeat*>(pt)->state_id; | |
b32b8144 | 1457 | if(state_id >= sizeof(m_bad_repeats) * CHAR_BIT) |
7c673cae FG |
1458 | return true; // run out of bits, assume we can't traverse this one. |
1459 | static const boost::uintmax_t one = 1uL; | |
1460 | return m_bad_repeats & (one << state_id); | |
1461 | } | |
1462 | default: | |
1463 | return false; | |
1464 | } | |
1465 | } | |
1466 | ||
1467 | template <class charT, class traits> | |
1468 | void basic_regex_creator<charT, traits>::set_bad_repeat(re_syntax_base* pt) | |
1469 | { | |
1470 | switch(pt->type) | |
1471 | { | |
1472 | case syntax_element_rep: | |
1473 | case syntax_element_dot_rep: | |
1474 | case syntax_element_char_rep: | |
1475 | case syntax_element_short_set_rep: | |
1476 | case syntax_element_long_set_rep: | |
1477 | { | |
1478 | unsigned state_id = static_cast<re_repeat*>(pt)->state_id; | |
1479 | static const boost::uintmax_t one = 1uL; | |
1480 | if(state_id <= sizeof(m_bad_repeats) * CHAR_BIT) | |
1481 | m_bad_repeats |= (one << state_id); | |
1482 | } | |
1483 | break; | |
1484 | default: | |
1485 | break; | |
1486 | } | |
1487 | } | |
1488 | ||
1489 | template <class charT, class traits> | |
1490 | syntax_element_type basic_regex_creator<charT, traits>::get_repeat_type(re_syntax_base* state) | |
1491 | { | |
1492 | typedef typename traits::char_class_type m_type; | |
1493 | if(state->type == syntax_element_rep) | |
1494 | { | |
1495 | // check to see if we are repeating a single state: | |
1496 | if(state->next.p->next.p->next.p == static_cast<re_alt*>(state)->alt.p) | |
1497 | { | |
1498 | switch(state->next.p->type) | |
1499 | { | |
1500 | case BOOST_REGEX_DETAIL_NS::syntax_element_wild: | |
1501 | return BOOST_REGEX_DETAIL_NS::syntax_element_dot_rep; | |
1502 | case BOOST_REGEX_DETAIL_NS::syntax_element_literal: | |
1503 | return BOOST_REGEX_DETAIL_NS::syntax_element_char_rep; | |
1504 | case BOOST_REGEX_DETAIL_NS::syntax_element_set: | |
1505 | return BOOST_REGEX_DETAIL_NS::syntax_element_short_set_rep; | |
1506 | case BOOST_REGEX_DETAIL_NS::syntax_element_long_set: | |
1507 | if(static_cast<BOOST_REGEX_DETAIL_NS::re_set_long<m_type>*>(state->next.p)->singleton) | |
1508 | return BOOST_REGEX_DETAIL_NS::syntax_element_long_set_rep; | |
1509 | break; | |
1510 | default: | |
1511 | break; | |
1512 | } | |
1513 | } | |
1514 | } | |
1515 | return state->type; | |
1516 | } | |
1517 | ||
1518 | template <class charT, class traits> | |
1519 | void basic_regex_creator<charT, traits>::probe_leading_repeat(re_syntax_base* state) | |
1520 | { | |
1521 | // enumerate our states, and see if we have a leading repeat | |
1e59de90 | 1522 | // for which failed search restarts can be optimized; |
7c673cae FG |
1523 | do |
1524 | { | |
1525 | switch(state->type) | |
1526 | { | |
1527 | case syntax_element_startmark: | |
1528 | if(static_cast<re_brace*>(state)->index >= 0) | |
1529 | { | |
1530 | state = state->next.p; | |
1531 | continue; | |
1532 | } | |
20effc67 TL |
1533 | #ifdef BOOST_MSVC |
1534 | # pragma warning(push) | |
1535 | #pragma warning(disable:6011) | |
1536 | #endif | |
7c673cae FG |
1537 | if((static_cast<re_brace*>(state)->index == -1) |
1538 | || (static_cast<re_brace*>(state)->index == -2)) | |
1539 | { | |
1540 | // skip past the zero width assertion: | |
1541 | state = static_cast<const re_jump*>(state->next.p)->alt.p->next.p; | |
1542 | continue; | |
1543 | } | |
20effc67 TL |
1544 | #ifdef BOOST_MSVC |
1545 | # pragma warning(pop) | |
1546 | #endif | |
7c673cae FG |
1547 | if(static_cast<re_brace*>(state)->index == -3) |
1548 | { | |
1549 | // Have to skip the leading jump state: | |
1550 | state = state->next.p->next.p; | |
1551 | continue; | |
1552 | } | |
1553 | return; | |
1554 | case syntax_element_endmark: | |
1555 | case syntax_element_start_line: | |
1556 | case syntax_element_end_line: | |
1557 | case syntax_element_word_boundary: | |
1558 | case syntax_element_within_word: | |
1559 | case syntax_element_word_start: | |
1560 | case syntax_element_word_end: | |
1561 | case syntax_element_buffer_start: | |
1562 | case syntax_element_buffer_end: | |
1563 | case syntax_element_restart_continue: | |
1564 | state = state->next.p; | |
1565 | break; | |
1566 | case syntax_element_dot_rep: | |
1567 | case syntax_element_char_rep: | |
1568 | case syntax_element_short_set_rep: | |
1569 | case syntax_element_long_set_rep: | |
1570 | if(this->m_has_backrefs == 0) | |
1571 | static_cast<re_repeat*>(state)->leading = true; | |
1572 | BOOST_FALLTHROUGH; | |
1573 | default: | |
1574 | return; | |
1575 | } | |
1576 | }while(state); | |
1577 | } | |
1578 | ||
7c673cae FG |
1579 | } // namespace BOOST_REGEX_DETAIL_NS |
1580 | ||
1581 | } // namespace boost | |
1582 | ||
1583 | #ifdef BOOST_MSVC | |
1584 | # pragma warning(pop) | |
1585 | #endif | |
1586 | ||
1587 | #ifdef BOOST_MSVC | |
1588 | #pragma warning(push) | |
1589 | #pragma warning(disable: 4103) | |
1590 | #endif | |
1591 | #ifdef BOOST_HAS_ABI_HEADERS | |
1592 | # include BOOST_ABI_SUFFIX | |
1593 | #endif | |
1594 | #ifdef BOOST_MSVC | |
1595 | #pragma warning(pop) | |
1596 | #endif | |
1597 | ||
1598 | #endif |