]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // parser.hpp |
2 | // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
f67539c2 TL |
6 | #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_PARSER_HPP |
7 | #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_PARSER_PARSER_HPP | |
7c673cae FG |
8 | |
9 | #include <boost/assert.hpp> | |
10 | #include "tree/end_node.hpp" | |
11 | #include "tree/iteration_node.hpp" | |
12 | #include "tree/leaf_node.hpp" | |
13 | #include "../runtime_error.hpp" | |
14 | #include "tree/selection_node.hpp" | |
15 | #include "tree/sequence_node.hpp" | |
16 | #include "../size_t.hpp" | |
17 | #include "tokeniser/re_tokeniser.hpp" | |
18 | ||
19 | namespace boost | |
20 | { | |
21 | namespace lexer | |
22 | { | |
23 | namespace detail | |
24 | { | |
25 | template<typename CharT> | |
26 | class basic_parser | |
27 | { | |
28 | public: | |
29 | typedef basic_re_tokeniser<CharT> tokeniser; | |
30 | typedef typename tokeniser::string string; | |
31 | typedef std::map<string, const node *> macro_map; | |
32 | typedef node::node_ptr_vector node_ptr_vector; | |
33 | typedef typename tokeniser::num_token token; | |
34 | ||
35 | /* | |
36 | General principles of regex parsing: | |
37 | - Every regex is a sequence of sub-regexes. | |
38 | - Regexes consist of operands and operators | |
39 | - All operators decompose to sequence, selection ('|') and iteration ('*') | |
40 | - Regex tokens are stored on the stack. | |
41 | - When a complete sequence of regex tokens is on the stack it is processed. | |
42 | ||
43 | Grammar: | |
44 | ||
45 | <REGEX> -> <OREXP> | |
46 | <OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE> | |
47 | <SEQUENCE> -> <SUB> | |
48 | <SUB> -> <EXPRESSION> | <SUB><EXPRESSION> | |
49 | <EXPRESSION> -> <REPEAT> | |
50 | <REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE> | |
51 | <DUPLICATE> -> '?' | '*' | '+' | '{n[,[m]]}' | |
52 | */ | |
53 | static node *parse (const CharT *start_, const CharT * const end_, | |
54 | const std::size_t id_, const std::size_t unique_id_, | |
55 | const std::size_t dfa_state_, const regex_flags flags_, | |
56 | const std::locale &locale_, node_ptr_vector &node_ptr_vector_, | |
57 | const macro_map ¯omap_, typename tokeniser::token_map &map_, | |
58 | bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) | |
59 | { | |
60 | node *root_ = 0; | |
61 | state state_ (start_, end_, flags_, locale_); | |
62 | token lhs_token_; | |
63 | token rhs_token_; | |
64 | token_stack token_stack_; | |
65 | tree_node_stack tree_node_stack_; | |
66 | char action_ = 0; | |
67 | ||
68 | token_stack_.push (rhs_token_); | |
69 | tokeniser::next (state_, map_, rhs_token_); | |
70 | ||
71 | do | |
72 | { | |
73 | lhs_token_ = token_stack_.top (); | |
74 | action_ = lhs_token_.precedence (rhs_token_._type); | |
75 | ||
76 | switch (action_) | |
77 | { | |
78 | case '<': | |
79 | case '=': | |
80 | token_stack_.push (rhs_token_); | |
81 | tokeniser::next (state_, map_, rhs_token_); | |
82 | break; | |
83 | case '>': | |
84 | reduce (token_stack_, macromap_, node_ptr_vector_, | |
85 | tree_node_stack_); | |
86 | break; | |
87 | default: | |
88 | std::ostringstream ss_; | |
89 | ||
90 | ss_ << "A syntax error occurred: '" << | |
91 | lhs_token_.precedence_string () << | |
92 | "' against '" << rhs_token_.precedence_string () << | |
93 | "' at index " << state_.index () << "."; | |
94 | throw runtime_error (ss_.str ().c_str ()); | |
95 | break; | |
96 | } | |
97 | } while (!token_stack_.empty ()); | |
98 | ||
99 | if (tree_node_stack_.empty ()) | |
100 | { | |
101 | throw runtime_error ("Empty rules are not allowed."); | |
102 | } | |
103 | ||
104 | BOOST_ASSERT(tree_node_stack_.size () == 1); | |
105 | ||
106 | node *lhs_node_ = tree_node_stack_.top (); | |
107 | ||
108 | tree_node_stack_.pop (); | |
109 | ||
110 | if (id_ == 0) | |
111 | { | |
112 | // Macros have no end state... | |
113 | root_ = lhs_node_; | |
114 | } | |
115 | else | |
116 | { | |
117 | node_ptr_vector_->push_back (static_cast<end_node *>(0)); | |
118 | ||
119 | node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_); | |
120 | ||
121 | node_ptr_vector_->back () = rhs_node_; | |
122 | node_ptr_vector_->push_back (static_cast<sequence_node *>(0)); | |
123 | node_ptr_vector_->back () = new sequence_node | |
124 | (lhs_node_, rhs_node_); | |
125 | root_ = node_ptr_vector_->back (); | |
126 | } | |
127 | ||
128 | // Done this way as bug in VC++ 6 prevents |= operator working | |
129 | // properly! | |
130 | if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true; | |
131 | ||
132 | if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true; | |
133 | ||
134 | return root_; | |
135 | } | |
136 | ||
137 | private: | |
138 | typedef typename tokeniser::state state; | |
139 | typedef std::stack<token> token_stack; | |
140 | typedef node::node_stack tree_node_stack; | |
141 | ||
142 | static void reduce (token_stack &token_stack_, | |
143 | const macro_map ¯omap_, node_ptr_vector &node_vector_ptr_, | |
144 | tree_node_stack &tree_node_stack_) | |
145 | { | |
146 | typename tokeniser::num_token lhs_; | |
147 | typename tokeniser::num_token rhs_; | |
148 | token_stack handle_; | |
149 | char action_ = 0; | |
150 | ||
151 | do | |
152 | { | |
153 | rhs_ = token_stack_.top (); | |
154 | token_stack_.pop (); | |
155 | handle_.push (rhs_); | |
156 | ||
157 | if (!token_stack_.empty ()) | |
158 | { | |
159 | lhs_ = token_stack_.top (); | |
160 | action_ = lhs_.precedence (rhs_._type); | |
161 | } | |
162 | } while (!token_stack_.empty () && action_ == '='); | |
163 | ||
164 | BOOST_ASSERT(token_stack_.empty () || action_ == '<'); | |
165 | ||
166 | switch (rhs_._type) | |
167 | { | |
168 | case token::BEGIN: | |
169 | // finished processing so exit | |
170 | break; | |
171 | case token::REGEX: | |
172 | // finished parsing, nothing to do | |
173 | break; | |
174 | case token::OREXP: | |
175 | orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); | |
176 | break; | |
177 | case token::SEQUENCE: | |
178 | token_stack_.push (token::OREXP); | |
179 | break; | |
180 | case token::SUB: | |
181 | sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); | |
182 | break; | |
183 | case token::EXPRESSION: | |
184 | token_stack_.push (token::SUB); | |
185 | break; | |
186 | case token::REPEAT: | |
187 | repeat (handle_, token_stack_); | |
188 | break; | |
189 | case token::CHARSET: | |
190 | charset (handle_, token_stack_, node_vector_ptr_, | |
191 | tree_node_stack_); | |
192 | break; | |
193 | case token::MACRO: | |
194 | macro (handle_, token_stack_, macromap_, node_vector_ptr_, | |
195 | tree_node_stack_); | |
196 | break; | |
197 | case token::OPENPAREN: | |
198 | openparen (handle_, token_stack_); | |
199 | break; | |
200 | case token::OPT: | |
201 | case token::AOPT: | |
202 | optional (rhs_._type == token::OPT, node_vector_ptr_, | |
203 | tree_node_stack_); | |
204 | token_stack_.push (token::DUP); | |
205 | break; | |
206 | case token::ZEROORMORE: | |
207 | case token::AZEROORMORE: | |
208 | zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_, | |
209 | tree_node_stack_); | |
210 | token_stack_.push (token::DUP); | |
211 | break; | |
212 | case token::ONEORMORE: | |
213 | case token::AONEORMORE: | |
214 | one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_, | |
215 | tree_node_stack_); | |
216 | token_stack_.push (token::DUP); | |
217 | break; | |
218 | case token::REPEATN: | |
219 | case token::AREPEATN: | |
220 | repeatn (rhs_._type == token::REPEATN, handle_.top (), | |
221 | node_vector_ptr_, tree_node_stack_); | |
222 | token_stack_.push (token::DUP); | |
223 | break; | |
224 | default: | |
225 | throw runtime_error | |
226 | ("Internal error regex_parser::reduce"); | |
227 | break; | |
228 | } | |
229 | } | |
230 | ||
231 | static void orexp (token_stack &handle_, token_stack &token_stack_, | |
232 | node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) | |
233 | { | |
234 | BOOST_ASSERT(handle_.top ()._type == token::OREXP && | |
235 | (handle_.size () == 1 || handle_.size () == 3)); | |
236 | ||
237 | if (handle_.size () == 1) | |
238 | { | |
239 | token_stack_.push (token::REGEX); | |
240 | } | |
241 | else | |
242 | { | |
243 | handle_.pop (); | |
244 | BOOST_ASSERT(handle_.top ()._type == token::OR); | |
245 | handle_.pop (); | |
246 | BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE); | |
247 | perform_or (node_ptr_vector_, tree_node_stack_); | |
248 | token_stack_.push (token::OREXP); | |
249 | } | |
250 | } | |
251 | ||
252 | static void sub (token_stack &handle_, token_stack &token_stack_, | |
253 | node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) | |
254 | { | |
255 | BOOST_ASSERT(handle_.top ()._type == token::SUB && | |
256 | (handle_.size () == 1 || handle_.size () == 2)); | |
257 | ||
258 | if (handle_.size () == 1) | |
259 | { | |
260 | token_stack_.push (token::SEQUENCE); | |
261 | } | |
262 | else | |
263 | { | |
264 | handle_.pop (); | |
265 | BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION); | |
266 | // perform join | |
267 | sequence (node_ptr_vector_, tree_node_stack_); | |
268 | token_stack_.push (token::SUB); | |
269 | } | |
270 | } | |
271 | ||
272 | static void repeat (token_stack &handle_, token_stack &token_stack_) | |
273 | { | |
274 | BOOST_ASSERT(handle_.top ()._type == token::REPEAT && | |
275 | handle_.size () >= 1 && handle_.size () <= 3); | |
276 | ||
277 | if (handle_.size () == 1) | |
278 | { | |
279 | token_stack_.push (token::EXPRESSION); | |
280 | } | |
281 | else | |
282 | { | |
283 | handle_.pop (); | |
284 | BOOST_ASSERT(handle_.top ()._type == token::DUP); | |
285 | token_stack_.push (token::REPEAT); | |
286 | } | |
287 | } | |
288 | ||
289 | static void charset (token_stack &handle_, token_stack &token_stack_, | |
290 | node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) | |
291 | { | |
292 | BOOST_ASSERT(handle_.top ()._type == token::CHARSET && | |
293 | handle_.size () == 1); | |
294 | // store charset | |
295 | node_ptr_vector_->push_back (static_cast<leaf_node *>(0)); | |
296 | ||
297 | const size_t id_ = handle_.top ()._id; | |
298 | ||
299 | node_ptr_vector_->back () = new leaf_node (id_, true); | |
300 | tree_node_stack_.push (node_ptr_vector_->back ()); | |
301 | token_stack_.push (token::REPEAT); | |
302 | } | |
303 | ||
304 | static void macro (token_stack &handle_, token_stack &token_stack_, | |
305 | const macro_map ¯omap_, node_ptr_vector &node_ptr_vector_, | |
306 | tree_node_stack &tree_node_stack_) | |
307 | { | |
308 | token &top_ = handle_.top (); | |
309 | ||
310 | BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1); | |
311 | ||
312 | typename macro_map::const_iterator iter_ = | |
313 | macromap_.find (top_._macro); | |
314 | ||
315 | if (iter_ == macromap_.end ()) | |
316 | { | |
317 | const CharT *name_ = top_._macro; | |
318 | std::basic_stringstream<CharT> ss_; | |
319 | std::ostringstream os_; | |
320 | ||
321 | os_ << "Unknown MACRO name '"; | |
322 | ||
323 | while (*name_) | |
324 | { | |
325 | os_ << ss_.narrow (*name_++, ' '); | |
326 | } | |
327 | ||
328 | os_ << "'."; | |
329 | throw runtime_error (os_.str ()); | |
330 | } | |
331 | ||
332 | tree_node_stack_.push (iter_->second->copy (node_ptr_vector_)); | |
333 | token_stack_.push (token::REPEAT); | |
334 | } | |
335 | ||
336 | static void openparen (token_stack &handle_, token_stack &token_stack_) | |
337 | { | |
338 | BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN && | |
339 | handle_.size () == 3); | |
340 | handle_.pop (); | |
341 | BOOST_ASSERT(handle_.top ()._type == token::REGEX); | |
342 | handle_.pop (); | |
343 | BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN); | |
344 | token_stack_.push (token::REPEAT); | |
345 | } | |
346 | ||
347 | static void perform_or (node_ptr_vector &node_ptr_vector_, | |
348 | tree_node_stack &tree_node_stack_) | |
349 | { | |
350 | // perform or | |
351 | node *rhs_ = tree_node_stack_.top (); | |
352 | ||
353 | tree_node_stack_.pop (); | |
354 | ||
355 | node *lhs_ = tree_node_stack_.top (); | |
356 | ||
357 | node_ptr_vector_->push_back (static_cast<selection_node *>(0)); | |
358 | node_ptr_vector_->back () = new selection_node (lhs_, rhs_); | |
359 | tree_node_stack_.top () = node_ptr_vector_->back (); | |
360 | } | |
361 | ||
362 | static void sequence (node_ptr_vector &node_ptr_vector_, | |
363 | tree_node_stack &tree_node_stack_) | |
364 | { | |
365 | node *rhs_ = tree_node_stack_.top (); | |
366 | ||
367 | tree_node_stack_.pop (); | |
368 | ||
369 | node *lhs_ = tree_node_stack_.top (); | |
370 | ||
371 | node_ptr_vector_->push_back (static_cast<sequence_node *>(0)); | |
372 | node_ptr_vector_->back () = new sequence_node (lhs_, rhs_); | |
373 | tree_node_stack_.top () = node_ptr_vector_->back (); | |
374 | } | |
375 | ||
376 | static void optional (const bool greedy_, | |
377 | node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) | |
378 | { | |
379 | // perform ? | |
380 | node *lhs_ = tree_node_stack_.top (); | |
381 | // You don't know if lhs_ is a leaf_node, so get firstpos. | |
382 | node::node_vector &firstpos_ = lhs_->firstpos (); | |
383 | ||
384 | for (node::node_vector::iterator iter_ = firstpos_.begin (), | |
385 | end_ = firstpos_.end (); iter_ != end_; ++iter_) | |
386 | { | |
387 | // These are leaf_nodes! | |
388 | (*iter_)->greedy (greedy_); | |
389 | } | |
390 | ||
391 | node_ptr_vector_->push_back (static_cast<leaf_node *>(0)); | |
392 | ||
393 | node *rhs_ = new leaf_node (null_token, greedy_); | |
394 | ||
395 | node_ptr_vector_->back () = rhs_; | |
396 | node_ptr_vector_->push_back (static_cast<selection_node *>(0)); | |
397 | node_ptr_vector_->back () = new selection_node (lhs_, rhs_); | |
398 | tree_node_stack_.top () = node_ptr_vector_->back (); | |
399 | } | |
400 | ||
401 | static void zero_or_more (const bool greedy_, | |
402 | node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) | |
403 | { | |
404 | // perform * | |
405 | node *ptr_ = tree_node_stack_.top (); | |
406 | ||
407 | node_ptr_vector_->push_back (static_cast<iteration_node *>(0)); | |
408 | node_ptr_vector_->back () = new iteration_node (ptr_, greedy_); | |
409 | tree_node_stack_.top () = node_ptr_vector_->back (); | |
410 | } | |
411 | ||
412 | static void one_or_more (const bool greedy_, | |
413 | node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) | |
414 | { | |
415 | // perform + | |
416 | node *lhs_ = tree_node_stack_.top (); | |
417 | node *copy_ = lhs_->copy (node_ptr_vector_); | |
418 | ||
419 | node_ptr_vector_->push_back (static_cast<iteration_node *>(0)); | |
420 | ||
421 | node *rhs_ = new iteration_node (copy_, greedy_); | |
422 | ||
423 | node_ptr_vector_->back () = rhs_; | |
424 | node_ptr_vector_->push_back (static_cast<sequence_node *>(0)); | |
425 | node_ptr_vector_->back () = new sequence_node (lhs_, rhs_); | |
426 | tree_node_stack_.top () = node_ptr_vector_->back (); | |
427 | } | |
428 | ||
429 | // This is one of the most mind bending routines in this code... | |
430 | static void repeatn (const bool greedy_, const token &token_, | |
431 | node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) | |
432 | { | |
433 | // perform {n[,[m]]} | |
434 | // Semantic checks have already been performed. | |
435 | // {0,} = * | |
436 | // {0,1} = ? | |
437 | // {1,} = + | |
438 | // therefore we do not check for these cases. | |
439 | if (!(token_._min == 1 && !token_._comma)) | |
440 | { | |
441 | const std::size_t top_ = token_._min > 0 ? | |
442 | token_._min : token_._max; | |
443 | ||
444 | if (token_._min == 0) | |
445 | { | |
446 | optional (greedy_, node_ptr_vector_, tree_node_stack_); | |
447 | } | |
448 | ||
449 | node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_); | |
450 | node *curr_ = 0; | |
451 | ||
452 | for (std::size_t i_ = 2; i_ < top_; ++i_) | |
453 | { | |
454 | curr_ = prev_->copy (node_ptr_vector_); | |
455 | tree_node_stack_.push (static_cast<node *>(0)); | |
456 | tree_node_stack_.top () = prev_; | |
457 | sequence (node_ptr_vector_, tree_node_stack_); | |
458 | prev_ = curr_; | |
459 | } | |
460 | ||
461 | if (token_._comma && token_._min > 0) | |
462 | { | |
463 | if (token_._min > 1) | |
464 | { | |
465 | curr_ = prev_->copy (node_ptr_vector_); | |
466 | tree_node_stack_.push (static_cast<node *>(0)); | |
467 | tree_node_stack_.top () = prev_; | |
468 | sequence (node_ptr_vector_, tree_node_stack_); | |
469 | prev_ = curr_; | |
470 | } | |
471 | ||
472 | if (token_._comma && token_._max) | |
473 | { | |
474 | tree_node_stack_.push (static_cast<node *>(0)); | |
475 | tree_node_stack_.top () = prev_; | |
476 | optional (greedy_, node_ptr_vector_, tree_node_stack_); | |
477 | prev_ = tree_node_stack_.top (); | |
478 | tree_node_stack_.pop (); | |
479 | ||
480 | const std::size_t count_ = token_._max - token_._min; | |
481 | ||
482 | for (std::size_t i_ = 1; i_ < count_; ++i_) | |
483 | { | |
484 | curr_ = prev_->copy (node_ptr_vector_); | |
485 | tree_node_stack_.push (static_cast<node *>(0)); | |
486 | tree_node_stack_.top () = prev_; | |
487 | sequence (node_ptr_vector_, tree_node_stack_); | |
488 | prev_ = curr_; | |
489 | } | |
490 | } | |
491 | else | |
492 | { | |
493 | tree_node_stack_.push (static_cast<node *>(0)); | |
494 | tree_node_stack_.top () = prev_; | |
495 | zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_); | |
496 | prev_ = tree_node_stack_.top (); | |
497 | tree_node_stack_.pop (); | |
498 | } | |
499 | } | |
500 | ||
501 | tree_node_stack_.push (static_cast<node *>(0)); | |
502 | tree_node_stack_.top () = prev_; | |
503 | sequence (node_ptr_vector_, tree_node_stack_); | |
504 | } | |
505 | } | |
506 | }; | |
507 | } | |
508 | } | |
509 | } | |
510 | ||
511 | #endif |