]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /////////////////////////////////////////////////////////////////////////////// |
2 | // detail/dynamic/parser_traits.hpp | |
3 | // | |
4 | // Copyright 2008 Eric Niebler. Distributed under the Boost | |
5 | // Software License, Version 1.0. (See accompanying file | |
6 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
7 | ||
8 | #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005 | |
9 | #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005 | |
10 | ||
11 | // MS compatible compilers support #pragma once | |
12 | #if defined(_MSC_VER) | |
13 | # pragma once | |
14 | #endif | |
15 | ||
16 | #include <string> | |
17 | #include <climits> | |
18 | #include <boost/config.hpp> | |
19 | #include <boost/assert.hpp> | |
20 | #include <boost/throw_exception.hpp> | |
21 | #include <boost/xpressive/regex_error.hpp> | |
22 | #include <boost/xpressive/regex_traits.hpp> | |
23 | #include <boost/xpressive/detail/detail_fwd.hpp> | |
24 | #include <boost/xpressive/detail/dynamic/matchable.hpp> | |
25 | #include <boost/xpressive/detail/dynamic/parser_enum.hpp> | |
26 | #include <boost/xpressive/detail/utility/literals.hpp> | |
27 | #include <boost/xpressive/detail/utility/algorithm.hpp> | |
28 | ||
29 | namespace boost { namespace xpressive | |
30 | { | |
31 | ||
32 | /////////////////////////////////////////////////////////////////////////////// | |
33 | // compiler_traits | |
34 | // this works for char and wchar_t. it must be specialized for anything else. | |
35 | // | |
36 | template<typename RegexTraits> | |
37 | struct compiler_traits | |
38 | { | |
39 | typedef RegexTraits regex_traits; | |
40 | typedef typename regex_traits::char_type char_type; | |
41 | typedef typename regex_traits::string_type string_type; | |
42 | typedef typename regex_traits::locale_type locale_type; | |
43 | ||
44 | /////////////////////////////////////////////////////////////////////////////// | |
45 | // constructor | |
46 | explicit compiler_traits(RegexTraits const &traits = RegexTraits()) | |
47 | : traits_(traits) | |
48 | , flags_(regex_constants::ECMAScript) | |
49 | , space_(lookup_classname(traits_, "space")) | |
50 | , alnum_(lookup_classname(traits_, "alnum")) | |
51 | { | |
52 | } | |
53 | ||
54 | /////////////////////////////////////////////////////////////////////////////// | |
55 | // flags | |
56 | regex_constants::syntax_option_type flags() const | |
57 | { | |
58 | return this->flags_; | |
59 | } | |
60 | ||
61 | /////////////////////////////////////////////////////////////////////////////// | |
62 | // flags | |
63 | void flags(regex_constants::syntax_option_type flags) | |
64 | { | |
65 | this->flags_ = flags; | |
66 | } | |
67 | ||
68 | /////////////////////////////////////////////////////////////////////////////// | |
69 | // traits | |
70 | regex_traits &traits() | |
71 | { | |
72 | return this->traits_; | |
73 | } | |
74 | ||
75 | regex_traits const &traits() const | |
76 | { | |
77 | return this->traits_; | |
78 | } | |
79 | ||
80 | /////////////////////////////////////////////////////////////////////////////// | |
81 | // imbue | |
82 | locale_type imbue(locale_type const &loc) | |
83 | { | |
84 | locale_type oldloc = this->traits().imbue(loc); | |
85 | this->space_ = lookup_classname(this->traits(), "space"); | |
86 | this->alnum_ = lookup_classname(this->traits(), "alnum"); | |
87 | return oldloc; | |
88 | } | |
89 | ||
90 | /////////////////////////////////////////////////////////////////////////////// | |
91 | // getloc | |
92 | locale_type getloc() const | |
93 | { | |
94 | return this->traits().getloc(); | |
95 | } | |
96 | ||
97 | /////////////////////////////////////////////////////////////////////////////// | |
98 | // get_token | |
99 | // get a token and advance the iterator | |
100 | template<typename FwdIter> | |
101 | regex_constants::compiler_token_type get_token(FwdIter &begin, FwdIter end) | |
102 | { | |
103 | using namespace regex_constants; | |
104 | if(this->eat_ws_(begin, end) == end) | |
105 | { | |
106 | return regex_constants::token_end_of_pattern; | |
107 | } | |
108 | ||
109 | switch(*begin) | |
110 | { | |
111 | case BOOST_XPR_CHAR_(char_type, '\\'): return this->get_escape_token(++begin, end); | |
112 | case BOOST_XPR_CHAR_(char_type, '.'): ++begin; return token_any; | |
113 | case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_assert_begin_line; | |
114 | case BOOST_XPR_CHAR_(char_type, '$'): ++begin; return token_assert_end_line; | |
115 | case BOOST_XPR_CHAR_(char_type, '('): ++begin; return token_group_begin; | |
116 | case BOOST_XPR_CHAR_(char_type, ')'): ++begin; return token_group_end; | |
117 | case BOOST_XPR_CHAR_(char_type, '|'): ++begin; return token_alternate; | |
118 | case BOOST_XPR_CHAR_(char_type, '['): ++begin; return token_charset_begin; | |
119 | ||
120 | case BOOST_XPR_CHAR_(char_type, '*'): | |
121 | case BOOST_XPR_CHAR_(char_type, '+'): | |
122 | case BOOST_XPR_CHAR_(char_type, '?'): | |
123 | return token_invalid_quantifier; | |
124 | ||
125 | case BOOST_XPR_CHAR_(char_type, ']'): | |
126 | case BOOST_XPR_CHAR_(char_type, '{'): | |
127 | default: | |
128 | return token_literal; | |
129 | } | |
130 | } | |
131 | ||
132 | /////////////////////////////////////////////////////////////////////////////// | |
133 | // get_quant_spec | |
134 | template<typename FwdIter> | |
135 | bool get_quant_spec(FwdIter &begin, FwdIter end, detail::quant_spec &spec) | |
136 | { | |
137 | using namespace regex_constants; | |
138 | FwdIter old_begin; | |
139 | ||
140 | if(this->eat_ws_(begin, end) == end) | |
141 | { | |
142 | return false; | |
143 | } | |
144 | ||
145 | switch(*begin) | |
146 | { | |
147 | case BOOST_XPR_CHAR_(char_type, '*'): | |
148 | spec.min_ = 0; | |
149 | spec.max_ = (std::numeric_limits<unsigned int>::max)(); | |
150 | break; | |
151 | ||
152 | case BOOST_XPR_CHAR_(char_type, '+'): | |
153 | spec.min_ = 1; | |
154 | spec.max_ = (std::numeric_limits<unsigned int>::max)(); | |
155 | break; | |
156 | ||
157 | case BOOST_XPR_CHAR_(char_type, '?'): | |
158 | spec.min_ = 0; | |
159 | spec.max_ = 1; | |
160 | break; | |
161 | ||
162 | case BOOST_XPR_CHAR_(char_type, '{'): | |
163 | old_begin = this->eat_ws_(++begin, end); | |
164 | spec.min_ = spec.max_ = detail::toi(begin, end, this->traits()); | |
165 | BOOST_XPR_ENSURE_ | |
166 | ( | |
167 | begin != old_begin && begin != end, error_brace, "invalid quantifier" | |
168 | ); | |
169 | ||
170 | if(*begin == BOOST_XPR_CHAR_(char_type, ',')) | |
171 | { | |
172 | old_begin = this->eat_ws_(++begin, end); | |
173 | spec.max_ = detail::toi(begin, end, this->traits()); | |
174 | BOOST_XPR_ENSURE_ | |
175 | ( | |
176 | begin != end && BOOST_XPR_CHAR_(char_type, '}') == *begin | |
177 | , error_brace, "invalid quantifier" | |
178 | ); | |
179 | ||
180 | if(begin == old_begin) | |
181 | { | |
182 | spec.max_ = (std::numeric_limits<unsigned int>::max)(); | |
183 | } | |
184 | else | |
185 | { | |
186 | BOOST_XPR_ENSURE_ | |
187 | ( | |
188 | spec.min_ <= spec.max_, error_badbrace, "invalid quantification range" | |
189 | ); | |
190 | } | |
191 | } | |
192 | else | |
193 | { | |
194 | BOOST_XPR_ENSURE_ | |
195 | ( | |
196 | BOOST_XPR_CHAR_(char_type, '}') == *begin, error_brace, "invalid quantifier" | |
197 | ); | |
198 | } | |
199 | break; | |
200 | ||
201 | default: | |
202 | return false; | |
203 | } | |
204 | ||
205 | spec.greedy_ = true; | |
206 | if(this->eat_ws_(++begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin) | |
207 | { | |
208 | ++begin; | |
209 | spec.greedy_ = false; | |
210 | } | |
211 | ||
212 | return true; | |
213 | } | |
214 | ||
215 | /////////////////////////////////////////////////////////////////////////// | |
216 | // get_group_type | |
217 | template<typename FwdIter> | |
218 | regex_constants::compiler_token_type get_group_type(FwdIter &begin, FwdIter end, string_type &name) | |
219 | { | |
220 | using namespace regex_constants; | |
221 | if(this->eat_ws_(begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin) | |
222 | { | |
223 | this->eat_ws_(++begin, end); | |
224 | BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); | |
225 | ||
226 | switch(*begin) | |
227 | { | |
228 | case BOOST_XPR_CHAR_(char_type, ':'): ++begin; return token_no_mark; | |
229 | case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_independent_sub_expression; | |
230 | case BOOST_XPR_CHAR_(char_type, '#'): ++begin; return token_comment; | |
231 | case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookahead; | |
232 | case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookahead; | |
233 | case BOOST_XPR_CHAR_(char_type, 'R'): ++begin; return token_recurse; | |
234 | case BOOST_XPR_CHAR_(char_type, '$'): | |
235 | this->get_name_(++begin, end, name); | |
236 | BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); | |
237 | if(BOOST_XPR_CHAR_(char_type, '=') == *begin) | |
238 | { | |
239 | ++begin; | |
240 | return token_rule_assign; | |
241 | } | |
242 | return token_rule_ref; | |
243 | ||
244 | case BOOST_XPR_CHAR_(char_type, '<'): | |
245 | this->eat_ws_(++begin, end); | |
246 | BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); | |
247 | switch(*begin) | |
248 | { | |
249 | case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookbehind; | |
250 | case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookbehind; | |
251 | default: | |
252 | BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension")); | |
253 | } | |
254 | ||
255 | case BOOST_XPR_CHAR_(char_type, 'P'): | |
256 | this->eat_ws_(++begin, end); | |
257 | BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); | |
258 | switch(*begin) | |
259 | { | |
260 | case BOOST_XPR_CHAR_(char_type, '<'): | |
261 | this->get_name_(++begin, end, name); | |
262 | BOOST_XPR_ENSURE_(begin != end && BOOST_XPR_CHAR_(char_type, '>') == *begin++, error_paren, "incomplete extension"); | |
263 | return token_named_mark; | |
264 | case BOOST_XPR_CHAR_(char_type, '='): | |
265 | this->get_name_(++begin, end, name); | |
266 | BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension"); | |
267 | return token_named_mark_ref; | |
268 | default: | |
269 | BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension")); | |
270 | } | |
271 | ||
272 | case BOOST_XPR_CHAR_(char_type, 'i'): | |
273 | case BOOST_XPR_CHAR_(char_type, 'm'): | |
274 | case BOOST_XPR_CHAR_(char_type, 's'): | |
275 | case BOOST_XPR_CHAR_(char_type, 'x'): | |
276 | case BOOST_XPR_CHAR_(char_type, '-'): | |
277 | return this->parse_mods_(begin, end); | |
278 | ||
279 | default: | |
280 | BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension")); | |
281 | } | |
282 | } | |
283 | ||
284 | return token_literal; | |
285 | } | |
286 | ||
287 | ////////////////////////////////////////////////////////////////////////// | |
288 | // get_charset_token | |
289 | // NOTE: white-space is *never* ignored in a charset. | |
290 | template<typename FwdIter> | |
291 | regex_constants::compiler_token_type get_charset_token(FwdIter &begin, FwdIter end) | |
292 | { | |
293 | using namespace regex_constants; | |
294 | BOOST_ASSERT(begin != end); | |
295 | switch(*begin) | |
296 | { | |
297 | case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_charset_invert; | |
298 | case BOOST_XPR_CHAR_(char_type, '-'): ++begin; return token_charset_hyphen; | |
299 | case BOOST_XPR_CHAR_(char_type, ']'): ++begin; return token_charset_end; | |
300 | case BOOST_XPR_CHAR_(char_type, '['): | |
301 | { | |
302 | FwdIter next = begin; ++next; | |
303 | if(next != end) | |
304 | { | |
305 | BOOST_XPR_ENSURE_( | |
306 | *next != BOOST_XPR_CHAR_(char_type, '=') | |
307 | , error_collate | |
308 | , "equivalence classes are not yet supported" | |
309 | ); | |
310 | ||
311 | BOOST_XPR_ENSURE_( | |
312 | *next != BOOST_XPR_CHAR_(char_type, '.') | |
313 | , error_collate | |
314 | , "collation sequences are not yet supported" | |
315 | ); | |
316 | ||
317 | if(*next == BOOST_XPR_CHAR_(char_type, ':')) | |
318 | { | |
319 | begin = ++next; | |
320 | return token_posix_charset_begin; | |
321 | } | |
322 | } | |
323 | } | |
324 | break; | |
325 | case BOOST_XPR_CHAR_(char_type, ':'): | |
326 | { | |
327 | FwdIter next = begin; ++next; | |
328 | if(next != end && *next == BOOST_XPR_CHAR_(char_type, ']')) | |
329 | { | |
330 | begin = ++next; | |
331 | return token_posix_charset_end; | |
332 | } | |
333 | } | |
334 | break; | |
335 | case BOOST_XPR_CHAR_(char_type, '\\'): | |
336 | if(++begin != end) | |
337 | { | |
338 | switch(*begin) | |
339 | { | |
340 | case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_charset_backspace; | |
341 | default:; | |
342 | } | |
343 | } | |
344 | return token_escape; | |
345 | default:; | |
346 | } | |
347 | return token_literal; | |
348 | } | |
349 | ||
350 | ////////////////////////////////////////////////////////////////////////// | |
351 | // get_escape_token | |
352 | template<typename FwdIter> | |
353 | regex_constants::compiler_token_type get_escape_token(FwdIter &begin, FwdIter end) | |
354 | { | |
355 | using namespace regex_constants; | |
356 | if(begin != end) | |
357 | { | |
358 | switch(*begin) | |
359 | { | |
360 | //case BOOST_XPR_CHAR_(char_type, 'a'): ++begin; return token_escape_bell; | |
361 | //case BOOST_XPR_CHAR_(char_type, 'c'): ++begin; return token_escape_control; | |
362 | //case BOOST_XPR_CHAR_(char_type, 'e'): ++begin; return token_escape_escape; | |
363 | //case BOOST_XPR_CHAR_(char_type, 'f'): ++begin; return token_escape_formfeed; | |
364 | //case BOOST_XPR_CHAR_(char_type, 'n'): ++begin; return token_escape_newline; | |
365 | //case BOOST_XPR_CHAR_(char_type, 't'): ++begin; return token_escape_horizontal_tab; | |
366 | //case BOOST_XPR_CHAR_(char_type, 'v'): ++begin; return token_escape_vertical_tab; | |
367 | case BOOST_XPR_CHAR_(char_type, 'A'): ++begin; return token_assert_begin_sequence; | |
368 | case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_assert_word_boundary; | |
369 | case BOOST_XPR_CHAR_(char_type, 'B'): ++begin; return token_assert_not_word_boundary; | |
370 | case BOOST_XPR_CHAR_(char_type, 'E'): ++begin; return token_quote_meta_end; | |
371 | case BOOST_XPR_CHAR_(char_type, 'Q'): ++begin; return token_quote_meta_begin; | |
372 | case BOOST_XPR_CHAR_(char_type, 'Z'): ++begin; return token_assert_end_sequence; | |
373 | // Non-standard extension to ECMAScript syntax | |
374 | case BOOST_XPR_CHAR_(char_type, '<'): ++begin; return token_assert_word_begin; | |
375 | case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_assert_word_end; | |
376 | default:; // fall-through | |
377 | } | |
378 | } | |
379 | ||
380 | return token_escape; | |
381 | } | |
382 | ||
383 | private: | |
384 | ||
385 | ////////////////////////////////////////////////////////////////////////// | |
386 | // parse_mods_ | |
387 | template<typename FwdIter> | |
388 | regex_constants::compiler_token_type parse_mods_(FwdIter &begin, FwdIter end) | |
389 | { | |
390 | using namespace regex_constants; | |
391 | bool set = true; | |
392 | do switch(*begin) | |
393 | { | |
394 | case BOOST_XPR_CHAR_(char_type, 'i'): this->flag_(set, icase_); break; | |
395 | case BOOST_XPR_CHAR_(char_type, 'm'): this->flag_(!set, single_line); break; | |
396 | case BOOST_XPR_CHAR_(char_type, 's'): this->flag_(!set, not_dot_newline); break; | |
397 | case BOOST_XPR_CHAR_(char_type, 'x'): this->flag_(set, ignore_white_space); break; | |
398 | case BOOST_XPR_CHAR_(char_type, ':'): ++begin; BOOST_FALLTHROUGH; | |
399 | case BOOST_XPR_CHAR_(char_type, ')'): return token_no_mark; | |
400 | case BOOST_XPR_CHAR_(char_type, '-'): if(false == (set = !set)) break; BOOST_FALLTHROUGH; | |
401 | default: BOOST_THROW_EXCEPTION(regex_error(error_paren, "unknown pattern modifier")); | |
402 | } | |
403 | while(BOOST_XPR_ENSURE_(++begin != end, error_paren, "incomplete extension")); | |
404 | // this return is technically unreachable, but this must | |
405 | // be here to work around a bug in gcc 4.0 | |
406 | return token_no_mark; | |
407 | } | |
408 | ||
409 | /////////////////////////////////////////////////////////////////////////////// | |
410 | // flag_ | |
411 | void flag_(bool set, regex_constants::syntax_option_type flag) | |
412 | { | |
413 | this->flags_ = set ? (this->flags_ | flag) : (this->flags_ & ~flag); | |
414 | } | |
415 | ||
416 | /////////////////////////////////////////////////////////////////////////// | |
417 | // is_space_ | |
418 | bool is_space_(char_type ch) const | |
419 | { | |
420 | return 0 != this->space_ && this->traits().isctype(ch, this->space_); | |
421 | } | |
422 | ||
423 | /////////////////////////////////////////////////////////////////////////// | |
424 | // is_alnum_ | |
425 | bool is_alnum_(char_type ch) const | |
426 | { | |
427 | return 0 != this->alnum_ && this->traits().isctype(ch, this->alnum_); | |
428 | } | |
429 | ||
430 | /////////////////////////////////////////////////////////////////////////// | |
431 | // get_name_ | |
432 | template<typename FwdIter> | |
433 | void get_name_(FwdIter &begin, FwdIter end, string_type &name) | |
434 | { | |
435 | this->eat_ws_(begin, end); | |
436 | for(name.clear(); begin != end && this->is_alnum_(*begin); ++begin) | |
437 | { | |
438 | name.push_back(*begin); | |
439 | } | |
440 | this->eat_ws_(begin, end); | |
441 | BOOST_XPR_ENSURE_(!name.empty(), regex_constants::error_paren, "incomplete extension"); | |
442 | } | |
443 | ||
444 | /////////////////////////////////////////////////////////////////////////////// | |
445 | // eat_ws_ | |
446 | template<typename FwdIter> | |
447 | FwdIter &eat_ws_(FwdIter &begin, FwdIter end) | |
448 | { | |
449 | if(0 != (regex_constants::ignore_white_space & this->flags())) | |
450 | { | |
451 | while(end != begin && (BOOST_XPR_CHAR_(char_type, '#') == *begin || this->is_space_(*begin))) | |
452 | { | |
453 | if(BOOST_XPR_CHAR_(char_type, '#') == *begin++) | |
454 | { | |
455 | while(end != begin && BOOST_XPR_CHAR_(char_type, '\n') != *begin++) {} | |
456 | } | |
457 | else | |
458 | { | |
459 | for(; end != begin && this->is_space_(*begin); ++begin) {} | |
460 | } | |
461 | } | |
462 | } | |
463 | ||
464 | return begin; | |
465 | } | |
466 | ||
467 | regex_traits traits_; | |
468 | regex_constants::syntax_option_type flags_; | |
469 | typename regex_traits::char_class_type space_; | |
470 | typename regex_traits::char_class_type alnum_; | |
471 | }; | |
472 | ||
473 | }} // namespace boost::xpressive | |
474 | ||
475 | #endif |