2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_LEXER_PARSER_HPP
7 #define BOOST_LEXER_PARSER_HPP
9 #include <boost/assert.hpp>
10 #include "tree/end_node.hpp"
11 #include "tree/iteration_node.hpp"
12 #include "tree/leaf_node.hpp"
13 #include "../runtime_error.hpp"
14 #include "tree/selection_node.hpp"
15 #include "tree/sequence_node.hpp"
16 #include "../size_t.hpp"
17 #include "tokeniser/re_tokeniser.hpp"
25 template<typename CharT>
29 typedef basic_re_tokeniser<CharT> tokeniser;
30 typedef typename tokeniser::string string;
31 typedef std::map<string, const node *> macro_map;
32 typedef node::node_ptr_vector node_ptr_vector;
33 typedef typename tokeniser::num_token token;
36 General principles of regex parsing:
37 - Every regex is a sequence of sub-regexes.
38 - Regexes consist of operands and operators
39 - All operators decompose to sequence, selection ('|') and iteration ('*')
40 - Regex tokens are stored on the stack.
41 - When a complete sequence of regex tokens is on the stack it is processed.
46 <OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
48 <SUB> -> <EXPRESSION> | <SUB><EXPRESSION>
49 <EXPRESSION> -> <REPEAT>
50 <REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
51 <DUPLICATE> -> '?' | '*' | '+' | '{n[,[m]]}'
53 static node *parse (const CharT *start_, const CharT * const end_,
54 const std::size_t id_, const std::size_t unique_id_,
55 const std::size_t dfa_state_, const regex_flags flags_,
56 const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
57 const macro_map ¯omap_, typename tokeniser::token_map &map_,
58 bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
61 state state_ (start_, end_, flags_, locale_);
64 token_stack token_stack_;
65 tree_node_stack tree_node_stack_;
68 token_stack_.push (rhs_token_);
69 tokeniser::next (state_, map_, rhs_token_);
73 lhs_token_ = token_stack_.top ();
74 action_ = lhs_token_.precedence (rhs_token_._type);
80 token_stack_.push (rhs_token_);
81 tokeniser::next (state_, map_, rhs_token_);
84 reduce (token_stack_, macromap_, node_ptr_vector_,
88 std::ostringstream ss_;
90 ss_ << "A syntax error occurred: '" <<
91 lhs_token_.precedence_string () <<
92 "' against '" << rhs_token_.precedence_string () <<
93 "' at index " << state_.index () << ".";
94 throw runtime_error (ss_.str ().c_str ());
97 } while (!token_stack_.empty ());
99 if (tree_node_stack_.empty ())
101 throw runtime_error ("Empty rules are not allowed.");
104 BOOST_ASSERT(tree_node_stack_.size () == 1);
106 node *lhs_node_ = tree_node_stack_.top ();
108 tree_node_stack_.pop ();
112 // Macros have no end state...
117 node_ptr_vector_->push_back (static_cast<end_node *>(0));
119 node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_);
121 node_ptr_vector_->back () = rhs_node_;
122 node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
123 node_ptr_vector_->back () = new sequence_node
124 (lhs_node_, rhs_node_);
125 root_ = node_ptr_vector_->back ();
128 // Done this way as bug in VC++ 6 prevents |= operator working
130 if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true;
132 if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true;
138 typedef typename tokeniser::state state;
139 typedef std::stack<token> token_stack;
140 typedef node::node_stack tree_node_stack;
142 static void reduce (token_stack &token_stack_,
143 const macro_map ¯omap_, node_ptr_vector &node_vector_ptr_,
144 tree_node_stack &tree_node_stack_)
146 typename tokeniser::num_token lhs_;
147 typename tokeniser::num_token rhs_;
153 rhs_ = token_stack_.top ();
157 if (!token_stack_.empty ())
159 lhs_ = token_stack_.top ();
160 action_ = lhs_.precedence (rhs_._type);
162 } while (!token_stack_.empty () && action_ == '=');
164 BOOST_ASSERT(token_stack_.empty () || action_ == '<');
169 // finished processing so exit
172 // finished parsing, nothing to do
175 orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
177 case token::SEQUENCE:
178 token_stack_.push (token::OREXP);
181 sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
183 case token::EXPRESSION:
184 token_stack_.push (token::SUB);
187 repeat (handle_, token_stack_);
190 charset (handle_, token_stack_, node_vector_ptr_,
194 macro (handle_, token_stack_, macromap_, node_vector_ptr_,
197 case token::OPENPAREN:
198 openparen (handle_, token_stack_);
202 optional (rhs_._type == token::OPT, node_vector_ptr_,
204 token_stack_.push (token::DUP);
206 case token::ZEROORMORE:
207 case token::AZEROORMORE:
208 zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_,
210 token_stack_.push (token::DUP);
212 case token::ONEORMORE:
213 case token::AONEORMORE:
214 one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_,
216 token_stack_.push (token::DUP);
219 case token::AREPEATN:
220 repeatn (rhs_._type == token::REPEATN, handle_.top (),
221 node_vector_ptr_, tree_node_stack_);
222 token_stack_.push (token::DUP);
226 ("Internal error regex_parser::reduce");
231 static void orexp (token_stack &handle_, token_stack &token_stack_,
232 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
234 BOOST_ASSERT(handle_.top ()._type == token::OREXP &&
235 (handle_.size () == 1 || handle_.size () == 3));
237 if (handle_.size () == 1)
239 token_stack_.push (token::REGEX);
244 BOOST_ASSERT(handle_.top ()._type == token::OR);
246 BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE);
247 perform_or (node_ptr_vector_, tree_node_stack_);
248 token_stack_.push (token::OREXP);
252 static void sub (token_stack &handle_, token_stack &token_stack_,
253 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
255 BOOST_ASSERT(handle_.top ()._type == token::SUB &&
256 (handle_.size () == 1 || handle_.size () == 2));
258 if (handle_.size () == 1)
260 token_stack_.push (token::SEQUENCE);
265 BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION);
267 sequence (node_ptr_vector_, tree_node_stack_);
268 token_stack_.push (token::SUB);
272 static void repeat (token_stack &handle_, token_stack &token_stack_)
274 BOOST_ASSERT(handle_.top ()._type == token::REPEAT &&
275 handle_.size () >= 1 && handle_.size () <= 3);
277 if (handle_.size () == 1)
279 token_stack_.push (token::EXPRESSION);
284 BOOST_ASSERT(handle_.top ()._type == token::DUP);
285 token_stack_.push (token::REPEAT);
289 static void charset (token_stack &handle_, token_stack &token_stack_,
290 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
292 BOOST_ASSERT(handle_.top ()._type == token::CHARSET &&
293 handle_.size () == 1);
295 node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
297 const size_t id_ = handle_.top ()._id;
299 node_ptr_vector_->back () = new leaf_node (id_, true);
300 tree_node_stack_.push (node_ptr_vector_->back ());
301 token_stack_.push (token::REPEAT);
304 static void macro (token_stack &handle_, token_stack &token_stack_,
305 const macro_map ¯omap_, node_ptr_vector &node_ptr_vector_,
306 tree_node_stack &tree_node_stack_)
308 token &top_ = handle_.top ();
310 BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1);
312 typename macro_map::const_iterator iter_ =
313 macromap_.find (top_._macro);
315 if (iter_ == macromap_.end ())
317 const CharT *name_ = top_._macro;
318 std::basic_stringstream<CharT> ss_;
319 std::ostringstream os_;
321 os_ << "Unknown MACRO name '";
325 os_ << ss_.narrow (*name_++, ' ');
329 throw runtime_error (os_.str ());
332 tree_node_stack_.push (iter_->second->copy (node_ptr_vector_));
333 token_stack_.push (token::REPEAT);
336 static void openparen (token_stack &handle_, token_stack &token_stack_)
338 BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN &&
339 handle_.size () == 3);
341 BOOST_ASSERT(handle_.top ()._type == token::REGEX);
343 BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN);
344 token_stack_.push (token::REPEAT);
347 static void perform_or (node_ptr_vector &node_ptr_vector_,
348 tree_node_stack &tree_node_stack_)
351 node *rhs_ = tree_node_stack_.top ();
353 tree_node_stack_.pop ();
355 node *lhs_ = tree_node_stack_.top ();
357 node_ptr_vector_->push_back (static_cast<selection_node *>(0));
358 node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
359 tree_node_stack_.top () = node_ptr_vector_->back ();
362 static void sequence (node_ptr_vector &node_ptr_vector_,
363 tree_node_stack &tree_node_stack_)
365 node *rhs_ = tree_node_stack_.top ();
367 tree_node_stack_.pop ();
369 node *lhs_ = tree_node_stack_.top ();
371 node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
372 node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
373 tree_node_stack_.top () = node_ptr_vector_->back ();
376 static void optional (const bool greedy_,
377 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
380 node *lhs_ = tree_node_stack_.top ();
381 // You don't know if lhs_ is a leaf_node, so get firstpos.
382 node::node_vector &firstpos_ = lhs_->firstpos ();
384 for (node::node_vector::iterator iter_ = firstpos_.begin (),
385 end_ = firstpos_.end (); iter_ != end_; ++iter_)
387 // These are leaf_nodes!
388 (*iter_)->greedy (greedy_);
391 node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
393 node *rhs_ = new leaf_node (null_token, greedy_);
395 node_ptr_vector_->back () = rhs_;
396 node_ptr_vector_->push_back (static_cast<selection_node *>(0));
397 node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
398 tree_node_stack_.top () = node_ptr_vector_->back ();
401 static void zero_or_more (const bool greedy_,
402 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
405 node *ptr_ = tree_node_stack_.top ();
407 node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
408 node_ptr_vector_->back () = new iteration_node (ptr_, greedy_);
409 tree_node_stack_.top () = node_ptr_vector_->back ();
412 static void one_or_more (const bool greedy_,
413 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
416 node *lhs_ = tree_node_stack_.top ();
417 node *copy_ = lhs_->copy (node_ptr_vector_);
419 node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
421 node *rhs_ = new iteration_node (copy_, greedy_);
423 node_ptr_vector_->back () = rhs_;
424 node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
425 node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
426 tree_node_stack_.top () = node_ptr_vector_->back ();
429 // This is one of the most mind bending routines in this code...
430 static void repeatn (const bool greedy_, const token &token_,
431 node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
434 // Semantic checks have already been performed.
438 // therefore we do not check for these cases.
439 if (!(token_._min == 1 && !token_._comma))
441 const std::size_t top_ = token_._min > 0 ?
442 token_._min : token_._max;
444 if (token_._min == 0)
446 optional (greedy_, node_ptr_vector_, tree_node_stack_);
449 node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_);
452 for (std::size_t i_ = 2; i_ < top_; ++i_)
454 curr_ = prev_->copy (node_ptr_vector_);
455 tree_node_stack_.push (static_cast<node *>(0));
456 tree_node_stack_.top () = prev_;
457 sequence (node_ptr_vector_, tree_node_stack_);
461 if (token_._comma && token_._min > 0)
465 curr_ = prev_->copy (node_ptr_vector_);
466 tree_node_stack_.push (static_cast<node *>(0));
467 tree_node_stack_.top () = prev_;
468 sequence (node_ptr_vector_, tree_node_stack_);
472 if (token_._comma && token_._max)
474 tree_node_stack_.push (static_cast<node *>(0));
475 tree_node_stack_.top () = prev_;
476 optional (greedy_, node_ptr_vector_, tree_node_stack_);
477 prev_ = tree_node_stack_.top ();
478 tree_node_stack_.pop ();
480 const std::size_t count_ = token_._max - token_._min;
482 for (std::size_t i_ = 1; i_ < count_; ++i_)
484 curr_ = prev_->copy (node_ptr_vector_);
485 tree_node_stack_.push (static_cast<node *>(0));
486 tree_node_stack_.top () = prev_;
487 sequence (node_ptr_vector_, tree_node_stack_);
493 tree_node_stack_.push (static_cast<node *>(0));
494 tree_node_stack_.top () = prev_;
495 zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_);
496 prev_ = tree_node_stack_.top ();
497 tree_node_stack_.pop ();
501 tree_node_stack_.push (static_cast<node *>(0));
502 tree_node_stack_.top () = prev_;
503 sequence (node_ptr_vector_, tree_node_stack_);