]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /*============================================================================= |
2 | Boost.Wave: A Standard compliant C++ preprocessor library | |
3 | ||
4 | Spirit based lexer | |
5 | ||
6 | http://www.boost.org/ | |
7 | ||
8 | Copyright (c) 2001, Daniel C. Nuffer. | |
9 | Copyright (c) 2001-2012 Hartmut Kaiser. | |
10 | Distributed under the Boost Software License, Version 1.0. (See accompanying | |
11 | file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
12 | ||
13 | TODO List: | |
14 | X callback objects (called when a match is made.) | |
15 | X callback passed first & last iterator, and | |
16 | a reference to a lexercontrol object that supports some | |
17 | operations on the lexer. | |
18 | set state | |
19 | terminate | |
20 | state stack (push, pop, top) | |
21 | set new token return value | |
22 | ignore the current token | |
23 | yymore | |
24 | get length of matched token | |
25 | get current lexer state | |
26 | X DFA serialization to save recomputing the DFA. | |
27 | ||
28 | lexer states. | |
29 | organize the file into hpp and ipp. arrange the classes in a logical order. | |
30 | use buffering - iterator only needs be an input iterator, | |
31 | lexer & callback are not templatized on iterator type, but char type. | |
32 | next_token is templatized on iterator type. | |
33 | beginning/ending contexts. | |
34 | ^ and $ | |
35 | DFA minimization. | |
36 | DFA table compression. | |
37 | ||
38 | =============================================================================*/ | |
39 | #ifndef BOOST_SPIRIT_LEXER_HPP | |
40 | #define BOOST_SPIRIT_LEXER_HPP | |
41 | ||
42 | /////////////////////////////////////////////////////////////////////////////// | |
92f5a8d4 | 43 | #include <boost/config.hpp> |
7c673cae FG |
44 | #include <boost/throw_exception.hpp> |
45 | ||
46 | #include <boost/spirit/include/classic_core.hpp> | |
47 | #include <boost/spirit/include/classic_symbols.hpp> | |
48 | #include <boost/spirit/include/classic_chset.hpp> | |
49 | #include <boost/spirit/include/classic_escape_char.hpp> | |
50 | ||
51 | #include <set> | |
52 | #include <map> | |
92f5a8d4 | 53 | #include <memory> // for auto_ptr/unique_ptr |
7c673cae FG |
54 | #include <vector> |
55 | #include <stack> | |
56 | #include <utility> // for pair | |
20effc67 | 57 | #if defined(BOOST_SPIRIT_DEBUG) |
7c673cae | 58 | #include <iostream> |
20effc67 | 59 | #endif |
7c673cae FG |
60 | #include <fstream> |
61 | #include <boost/assert.hpp> | |
62 | #include <boost/limits.hpp> | |
63 | ||
64 | #if defined(BOOST_NO_STD_ITERATOR_TRAITS) | |
65 | #define BOOST_SPIRIT_IT_NS impl | |
66 | #else | |
67 | #define BOOST_SPIRIT_IT_NS std | |
68 | #endif | |
69 | ||
70 | /////////////////////////////////////////////////////////////////////////////// | |
71 | namespace boost { | |
72 | namespace spirit { | |
73 | namespace classic { | |
74 | ||
75 | typedef unsigned char uchar; | |
76 | typedef unsigned int node_id_t; | |
77 | const node_id_t invalid_node = node_id_t(-1); | |
78 | typedef std::set<node_id_t> node_set; | |
79 | typedef std::vector<uchar> uchar_vector; | |
80 | typedef std::map<node_id_t, node_set> followpos_t; | |
81 | typedef std::vector<uchar_vector> state_match_t; | |
82 | ||
83 | template <typename TokenT> | |
84 | class lexer_control; | |
85 | ||
86 | class bad_regex : public std::exception | |
87 | { | |
88 | }; | |
89 | ||
90 | namespace lexerimpl | |
91 | { | |
92 | ||
93 | class node | |
94 | { | |
95 | public: | |
96 | ||
97 | virtual ~node() {} | |
98 | ||
99 | virtual node* clone() const = 0; | |
100 | virtual bool nullable() const = 0; | |
101 | virtual node_set firstpos() const = 0; | |
102 | virtual node_set lastpos() const = 0; | |
103 | virtual void compute_followpos(followpos_t& followpos) const = 0; | |
104 | virtual void compute_state_match(state_match_t& state_match) const = 0; | |
105 | virtual void get_eof_ids(node_set& eof_set) const = 0; | |
106 | virtual void assign_node_ids(node_id_t& node_count) = 0; | |
107 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
108 | virtual void dump(std::ostream& out) const = 0; | |
109 | #endif | |
110 | ||
111 | }; | |
112 | ||
113 | class char_node : public node | |
114 | { | |
115 | ||
116 | public: | |
117 | ||
118 | char_node(const uchar c); | |
119 | char_node(const char_node& x); | |
120 | virtual ~char_node(){} | |
121 | ||
20effc67 TL |
122 | node* clone() const BOOST_OVERRIDE; |
123 | bool nullable() const BOOST_OVERRIDE; | |
124 | node_set firstpos() const BOOST_OVERRIDE; | |
125 | node_set lastpos() const BOOST_OVERRIDE; | |
126 | void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE; | |
127 | void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE; | |
128 | void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE; | |
129 | void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE; | |
7c673cae | 130 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
20effc67 | 131 | void dump(std::ostream& out) const BOOST_OVERRIDE; |
7c673cae FG |
132 | #endif |
133 | ||
134 | private: | |
135 | ||
136 | uchar m_char; | |
137 | node_id_t m_node_num; | |
138 | }; | |
139 | ||
140 | inline | |
141 | char_node::char_node(const uchar c) | |
142 | : node() | |
143 | , m_char(c) | |
144 | , m_node_num(0) | |
145 | { | |
146 | } | |
147 | ||
148 | inline | |
149 | char_node::char_node(const char_node& x) | |
150 | : node(x) | |
151 | , m_char(x.m_char) | |
152 | , m_node_num(x.m_node_num) | |
153 | { | |
154 | } | |
155 | ||
156 | inline node * | |
157 | char_node::clone() const | |
158 | { | |
159 | return new char_node(*this); | |
160 | } | |
161 | ||
162 | inline bool | |
163 | char_node::nullable() const | |
164 | { | |
165 | return false; | |
166 | } | |
167 | ||
168 | inline node_set | |
169 | char_node::firstpos() const | |
170 | { | |
171 | node_set rval; | |
172 | rval.insert(m_node_num); | |
173 | return rval; | |
174 | } | |
175 | ||
176 | inline node_set | |
177 | char_node::lastpos() const | |
178 | { | |
179 | return firstpos(); | |
180 | } | |
181 | ||
182 | inline void | |
183 | char_node::compute_followpos(followpos_t&) const | |
184 | { | |
185 | return; | |
186 | } | |
187 | ||
188 | inline void | |
189 | char_node::compute_state_match(state_match_t& state_match) const | |
190 | { | |
191 | if (state_match.size() < m_node_num + 1) | |
192 | state_match.resize(m_node_num + 1); | |
193 | state_match[m_node_num].resize(256); | |
194 | state_match[m_node_num][m_char] = 1; | |
195 | } | |
196 | ||
197 | inline void | |
198 | char_node::get_eof_ids(node_set&) const | |
199 | { | |
200 | return; | |
201 | } | |
202 | ||
203 | inline void | |
204 | char_node::assign_node_ids(node_id_t& node_count) | |
205 | { | |
206 | m_node_num = node_count++; | |
207 | } | |
208 | ||
209 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
210 | inline void | |
211 | char_node::dump(std::ostream& out) const | |
212 | { | |
213 | out << "\nchar_node m_char = " << m_char; | |
214 | out << " m_node_num = " << m_node_num; | |
215 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
216 | out << " firstpos() = "; | |
217 | node_set fp = firstpos(); | |
218 | std::copy(fp.begin(), fp.end(), | |
219 | std::ostream_iterator<node_id_t>(out, ",")); | |
220 | ||
221 | out << " lastpos() = "; | |
222 | node_set lp = lastpos(); | |
223 | std::copy(lp.begin(), lp.end(), | |
224 | std::ostream_iterator<node_id_t>(out, ",")); | |
225 | } | |
226 | #endif | |
227 | ||
228 | ||
229 | class epsilon_node : public node | |
230 | { | |
231 | ||
232 | public: | |
233 | ||
234 | epsilon_node(); | |
235 | epsilon_node(const epsilon_node& x); | |
236 | virtual ~epsilon_node(){} | |
237 | ||
20effc67 TL |
238 | node* clone() const BOOST_OVERRIDE; |
239 | bool nullable() const BOOST_OVERRIDE; | |
240 | node_set firstpos() const BOOST_OVERRIDE; | |
241 | node_set lastpos() const BOOST_OVERRIDE; | |
242 | void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE; | |
243 | void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE; | |
244 | void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE; | |
245 | void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE; | |
7c673cae | 246 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
20effc67 | 247 | void dump(std::ostream& out) const BOOST_OVERRIDE; |
7c673cae FG |
248 | #endif |
249 | ||
250 | private: | |
251 | ||
252 | node_id_t m_node_num; | |
253 | }; | |
254 | ||
255 | inline | |
256 | epsilon_node::epsilon_node() | |
257 | : node() | |
258 | , m_node_num(0) | |
259 | { | |
260 | } | |
261 | ||
262 | inline | |
263 | epsilon_node::epsilon_node(const epsilon_node& x) | |
264 | : node(x) | |
265 | , m_node_num(x.m_node_num) | |
266 | { | |
267 | } | |
268 | ||
269 | inline node * | |
270 | epsilon_node::clone() const | |
271 | { | |
272 | return new epsilon_node(*this); | |
273 | } | |
274 | ||
275 | inline bool | |
276 | epsilon_node::nullable() const | |
277 | { | |
278 | return true; | |
279 | } | |
280 | ||
281 | inline node_set | |
282 | epsilon_node::firstpos() const | |
283 | { | |
284 | return node_set(); | |
285 | } | |
286 | ||
287 | inline node_set | |
288 | epsilon_node::lastpos() const | |
289 | { | |
290 | return node_set(); | |
291 | } | |
292 | ||
293 | inline void | |
294 | epsilon_node::compute_followpos(followpos_t&) const | |
295 | { | |
296 | return; | |
297 | } | |
298 | ||
299 | inline void | |
300 | epsilon_node::compute_state_match(state_match_t& state_match) const | |
301 | { | |
302 | if (state_match.size() < m_node_num + 1) | |
303 | state_match.resize(m_node_num + 1); | |
304 | state_match[m_node_num].resize(256, 1); | |
305 | } | |
306 | ||
307 | inline void | |
308 | epsilon_node::get_eof_ids(node_set&) const | |
309 | { | |
310 | return; | |
311 | } | |
312 | ||
313 | inline void | |
314 | epsilon_node::assign_node_ids(node_id_t& node_count) | |
315 | { | |
316 | m_node_num = node_count++; | |
317 | } | |
318 | ||
319 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
320 | inline void | |
321 | epsilon_node::dump(std::ostream& out) const | |
322 | { | |
323 | out << "\nepsilon_node"; | |
324 | out << " m_node_num = " << m_node_num; | |
325 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
326 | out << " firstpos() = "; | |
327 | node_set fp = firstpos(); | |
328 | std::copy(fp.begin(), fp.end(), | |
329 | std::ostream_iterator<node_id_t>(out, ",")); | |
330 | ||
331 | out << " lastpos() = "; | |
332 | node_set lp = lastpos(); | |
333 | std::copy(lp.begin(), lp.end(), | |
334 | std::ostream_iterator<node_id_t>(out, ",")); | |
335 | } | |
336 | #endif | |
337 | ||
338 | ||
339 | class or_node : public node | |
340 | { | |
341 | ||
342 | public: | |
343 | ||
344 | or_node(node* left, node* right); | |
345 | or_node(const or_node& x); | |
346 | virtual ~or_node(){} | |
347 | ||
20effc67 TL |
348 | node* clone() const BOOST_OVERRIDE; |
349 | bool nullable() const BOOST_OVERRIDE; | |
350 | node_set firstpos() const BOOST_OVERRIDE; | |
351 | node_set lastpos() const BOOST_OVERRIDE; | |
352 | void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE; | |
353 | void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE; | |
354 | void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE; | |
355 | void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE; | |
7c673cae | 356 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
20effc67 | 357 | void dump(std::ostream& out) const BOOST_OVERRIDE; |
7c673cae FG |
358 | #endif |
359 | ||
360 | private: | |
361 | ||
92f5a8d4 | 362 | #ifndef BOOST_NO_CXX11_SMART_PTR |
b32b8144 FG |
363 | std::unique_ptr<node> m_left; |
364 | std::unique_ptr<node> m_right; | |
365 | #else | |
7c673cae FG |
366 | std::auto_ptr<node> m_left; |
367 | std::auto_ptr<node> m_right; | |
b32b8144 | 368 | #endif |
7c673cae FG |
369 | }; |
370 | ||
371 | inline | |
372 | or_node::or_node(node* left, node* right) | |
373 | : node() | |
374 | , m_left(left) | |
375 | , m_right(right) | |
376 | { | |
377 | } | |
378 | ||
379 | inline | |
380 | or_node::or_node(const or_node& x) | |
381 | : node(x) | |
382 | , m_left(x.m_left->clone()) | |
383 | , m_right(x.m_right->clone()) | |
384 | { | |
385 | } | |
386 | ||
387 | inline node * | |
388 | or_node::clone() const | |
389 | { | |
390 | return new or_node(m_left->clone(), m_right->clone()); | |
391 | } | |
392 | ||
393 | inline bool | |
394 | or_node::nullable() const | |
395 | { | |
396 | return m_left->nullable() || m_right->nullable(); | |
397 | } | |
398 | ||
399 | inline node_set | |
400 | or_node::firstpos() const | |
401 | { | |
402 | node_set rval; | |
403 | node_set l = m_left->firstpos(); | |
404 | node_set r = m_right->firstpos(); | |
405 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
406 | std::inserter(rval, rval.begin())); | |
407 | return rval; | |
408 | } | |
409 | ||
410 | inline node_set | |
411 | or_node::lastpos() const | |
412 | { | |
413 | node_set rval; | |
414 | node_set l = m_left->lastpos(); | |
415 | node_set r = m_right->lastpos(); | |
416 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
417 | std::inserter(rval, rval.begin())); | |
418 | return rval; | |
419 | } | |
420 | ||
421 | inline void | |
422 | or_node::compute_followpos(followpos_t& followpos) const | |
423 | { | |
424 | m_left->compute_followpos(followpos); | |
425 | m_right->compute_followpos(followpos); | |
426 | } | |
427 | ||
428 | inline void | |
429 | or_node::compute_state_match(state_match_t& state_match) const | |
430 | { | |
431 | m_left->compute_state_match(state_match); | |
432 | m_right->compute_state_match(state_match); | |
433 | } | |
434 | ||
435 | inline void | |
436 | or_node::get_eof_ids(node_set& eof_nodes) const | |
437 | { | |
438 | m_left->get_eof_ids(eof_nodes); | |
439 | m_right->get_eof_ids(eof_nodes); | |
440 | } | |
441 | ||
442 | inline void | |
443 | or_node::assign_node_ids(node_id_t& node_count) | |
444 | { | |
445 | m_left->assign_node_ids(node_count); | |
446 | m_right->assign_node_ids(node_count); | |
447 | } | |
448 | ||
449 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
450 | inline void | |
451 | or_node::dump(std::ostream& out) const | |
452 | { | |
453 | m_left->dump(out); | |
454 | ||
455 | out << "\nor_node"; | |
456 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
457 | out << " firstpos() = "; | |
458 | node_set fp = firstpos(); | |
459 | std::copy(fp.begin(), fp.end(), | |
460 | std::ostream_iterator<node_id_t>(out, ",")); | |
461 | ||
462 | out << " lastpos() = "; | |
463 | node_set lp = lastpos(); | |
464 | std::copy(lp.begin(), lp.end(), | |
465 | std::ostream_iterator<node_id_t>(out, ",")); | |
466 | ||
467 | m_right->dump(out); | |
468 | } | |
469 | #endif | |
470 | ||
471 | ||
472 | class cat_node : public node | |
473 | { | |
474 | ||
475 | public: | |
476 | ||
477 | cat_node(node* left, node* right); | |
478 | cat_node(const cat_node& x); | |
479 | virtual ~cat_node(){} | |
480 | ||
20effc67 TL |
481 | node* clone() const BOOST_OVERRIDE; |
482 | bool nullable() const BOOST_OVERRIDE; | |
483 | node_set firstpos() const BOOST_OVERRIDE; | |
484 | node_set lastpos() const BOOST_OVERRIDE; | |
485 | void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE; | |
486 | void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE; | |
487 | void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE; | |
488 | void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE; | |
7c673cae | 489 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
20effc67 | 490 | void dump(std::ostream& out) const BOOST_OVERRIDE; |
7c673cae FG |
491 | #endif |
492 | ||
493 | private: | |
494 | ||
92f5a8d4 | 495 | #ifndef BOOST_NO_CXX11_SMART_PTR |
b32b8144 FG |
496 | std::unique_ptr<node> m_left; |
497 | std::unique_ptr<node> m_right; | |
498 | #else | |
7c673cae FG |
499 | std::auto_ptr<node> m_left; |
500 | std::auto_ptr<node> m_right; | |
b32b8144 | 501 | #endif |
7c673cae FG |
502 | }; |
503 | ||
504 | inline | |
505 | cat_node::cat_node(node* left, node* right) | |
506 | : node() | |
507 | , m_left(left) | |
508 | , m_right(right) | |
509 | { | |
510 | } | |
511 | ||
512 | inline | |
513 | cat_node::cat_node(const cat_node& x) | |
514 | : node(x) | |
515 | , m_left(x.m_left->clone()) | |
516 | , m_right(x.m_right->clone()) | |
517 | { | |
518 | } | |
519 | ||
520 | inline node * | |
521 | cat_node::clone() const | |
522 | { | |
523 | return new cat_node(m_left->clone(), m_right->clone()); | |
524 | } | |
525 | ||
526 | inline bool | |
527 | cat_node::nullable() const | |
528 | { | |
529 | return m_left->nullable() && m_right->nullable(); | |
530 | } | |
531 | ||
532 | inline node_set | |
533 | cat_node::firstpos() const | |
534 | { | |
535 | if (m_left->nullable()) | |
536 | { | |
537 | node_set rval; | |
538 | node_set l = m_left->firstpos(); | |
539 | node_set r = m_right->firstpos(); | |
540 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
541 | std::inserter(rval, rval.begin())); | |
542 | return rval; | |
543 | } | |
544 | else | |
545 | { | |
546 | return m_left->firstpos(); | |
547 | } | |
548 | } | |
549 | ||
550 | inline node_set | |
551 | cat_node::lastpos() const | |
552 | { | |
553 | if (m_right->nullable()) | |
554 | { | |
555 | node_set rval; | |
556 | node_set l = m_left->lastpos(); | |
557 | node_set r = m_right->lastpos(); | |
558 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
559 | std::inserter(rval, rval.begin())); | |
560 | return rval; | |
561 | } | |
562 | else | |
563 | { | |
564 | return m_right->lastpos(); | |
565 | } | |
566 | } | |
567 | ||
568 | inline void | |
569 | cat_node::compute_followpos(followpos_t& followpos) const | |
570 | { | |
571 | node_set l = m_left->lastpos(); | |
572 | for (node_set::iterator i = l.begin(); | |
573 | i != l.end(); | |
574 | ++i) | |
575 | { | |
576 | node_set rf = m_right->firstpos(); | |
577 | followpos[*i].insert(rf.begin(), rf.end()); | |
578 | } | |
579 | ||
580 | m_left->compute_followpos(followpos); | |
581 | m_right->compute_followpos(followpos); | |
582 | } | |
583 | ||
584 | inline void | |
585 | cat_node::compute_state_match(state_match_t& state_match) const | |
586 | { | |
587 | m_left->compute_state_match(state_match); | |
588 | m_right->compute_state_match(state_match); | |
589 | } | |
590 | ||
591 | inline void | |
592 | cat_node::get_eof_ids(node_set& eof_nodes) const | |
593 | { | |
594 | m_left->get_eof_ids(eof_nodes); | |
595 | m_right->get_eof_ids(eof_nodes); | |
596 | } | |
597 | ||
598 | inline void | |
599 | cat_node::assign_node_ids(node_id_t& node_count) | |
600 | { | |
601 | m_left->assign_node_ids(node_count); | |
602 | m_right->assign_node_ids(node_count); | |
603 | } | |
604 | ||
605 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
606 | inline void | |
607 | cat_node::dump(std::ostream& out) const | |
608 | { | |
609 | m_left->dump(out); | |
610 | ||
611 | out << "\ncat_node"; | |
612 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
613 | out << " firstpos() = "; | |
614 | node_set fp = firstpos(); | |
615 | std::copy(fp.begin(), fp.end(), | |
616 | std::ostream_iterator<node_id_t>(out, ",")); | |
617 | ||
618 | out << " lastpos() = "; | |
619 | node_set lp = lastpos(); | |
620 | std::copy(lp.begin(), lp.end(), | |
621 | std::ostream_iterator<node_id_t>(out, ",")); | |
622 | ||
623 | m_right->dump(out); | |
624 | } | |
625 | #endif | |
626 | ||
627 | ||
628 | class star_node : public node | |
629 | { | |
630 | ||
631 | public: | |
632 | ||
633 | star_node(node* left); | |
634 | star_node(const star_node& x); | |
635 | virtual ~star_node(){} | |
636 | ||
20effc67 TL |
637 | node* clone() const BOOST_OVERRIDE; |
638 | bool nullable() const BOOST_OVERRIDE; | |
639 | node_set firstpos() const BOOST_OVERRIDE; | |
640 | node_set lastpos() const BOOST_OVERRIDE; | |
641 | void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE; | |
642 | void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE; | |
643 | void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE; | |
644 | void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE; | |
7c673cae | 645 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
20effc67 | 646 | void dump(std::ostream& out) const BOOST_OVERRIDE; |
7c673cae FG |
647 | #endif |
648 | ||
649 | private: | |
650 | ||
92f5a8d4 | 651 | #ifndef BOOST_NO_CXX11_SMART_PTR |
b32b8144 FG |
652 | std::unique_ptr<node> m_left; |
653 | #else | |
7c673cae | 654 | std::auto_ptr<node> m_left; |
b32b8144 | 655 | #endif |
7c673cae FG |
656 | }; |
657 | ||
658 | inline | |
659 | star_node::star_node(node* left) | |
660 | : node() | |
661 | , m_left(left) | |
662 | { | |
663 | } | |
664 | ||
665 | inline | |
666 | star_node::star_node(const star_node& x) | |
667 | : node(x) | |
668 | , m_left(x.m_left->clone()) | |
669 | { | |
670 | } | |
671 | ||
672 | inline node * | |
673 | star_node::clone() const | |
674 | { | |
675 | return new star_node(m_left->clone()); | |
676 | } | |
677 | ||
678 | inline bool | |
679 | star_node::nullable() const | |
680 | { | |
681 | return true; | |
682 | } | |
683 | ||
684 | inline node_set | |
685 | star_node::firstpos() const | |
686 | { | |
687 | return m_left->firstpos(); | |
688 | } | |
689 | ||
690 | inline node_set | |
691 | star_node::lastpos() const | |
692 | { | |
693 | return m_left->lastpos(); | |
694 | } | |
695 | ||
696 | inline void | |
697 | star_node::compute_followpos(followpos_t& followpos) const | |
698 | { | |
699 | node_set lp = this->lastpos(); | |
700 | for (node_set::iterator i = lp.begin(); | |
701 | i != lp.end(); | |
702 | ++i) | |
703 | { | |
704 | node_set fp = this->firstpos(); | |
705 | followpos[*i].insert(fp.begin(), fp.end()); | |
706 | } | |
707 | ||
708 | m_left->compute_followpos(followpos); | |
709 | } | |
710 | ||
711 | inline void | |
712 | star_node::compute_state_match(state_match_t& state_match) const | |
713 | { | |
714 | m_left->compute_state_match(state_match); | |
715 | } | |
716 | ||
717 | inline void | |
718 | star_node::get_eof_ids(node_set& eof_nodes) const | |
719 | { | |
720 | m_left->get_eof_ids(eof_nodes); | |
721 | } | |
722 | ||
723 | inline void | |
724 | star_node::assign_node_ids(node_id_t& node_count) | |
725 | { | |
726 | m_left->assign_node_ids(node_count); | |
727 | } | |
728 | ||
729 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
730 | inline void | |
731 | star_node::dump(std::ostream& out) const | |
732 | { | |
733 | m_left->dump(out); | |
734 | ||
735 | out << "\nstar_node"; | |
736 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
737 | out << " firstpos() = "; | |
738 | node_set fp = firstpos(); | |
739 | std::copy(fp.begin(), fp.end(), | |
740 | std::ostream_iterator<node_id_t>(out, ",")); | |
741 | ||
742 | out << " lastpos() = "; | |
743 | node_set lp = lastpos(); | |
744 | std::copy(lp.begin(), lp.end(), | |
745 | std::ostream_iterator<node_id_t>(out, ",")); | |
746 | ||
747 | } | |
748 | #endif | |
749 | ||
750 | ||
751 | class eof_node : public node | |
752 | { | |
753 | ||
754 | public: | |
755 | ||
756 | eof_node(); | |
757 | eof_node(const eof_node& x); | |
758 | virtual ~eof_node(){} | |
759 | ||
20effc67 TL |
760 | node* clone() const BOOST_OVERRIDE; |
761 | bool nullable() const BOOST_OVERRIDE; | |
762 | node_set firstpos() const BOOST_OVERRIDE; | |
763 | node_set lastpos() const BOOST_OVERRIDE; | |
764 | void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE; | |
765 | void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE; | |
766 | void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE; | |
767 | void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE; | |
7c673cae | 768 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
20effc67 | 769 | void dump(std::ostream& out) const BOOST_OVERRIDE; |
7c673cae FG |
770 | #endif |
771 | ||
772 | private: | |
773 | ||
774 | node_id_t m_node_num; | |
775 | }; | |
776 | ||
777 | inline | |
778 | eof_node::eof_node() | |
779 | : node() | |
780 | , m_node_num(0) | |
781 | { | |
782 | } | |
783 | ||
784 | inline | |
785 | eof_node::eof_node(const eof_node& x) | |
786 | : node(x) | |
787 | , m_node_num(x.m_node_num) | |
788 | { | |
789 | } | |
790 | ||
791 | inline node * | |
792 | eof_node::clone() const | |
793 | { | |
794 | return new eof_node(*this); | |
795 | } | |
796 | ||
797 | inline bool | |
798 | eof_node::nullable() const | |
799 | { | |
800 | return false; | |
801 | } | |
802 | ||
803 | inline node_set | |
804 | eof_node::firstpos() const | |
805 | { | |
806 | node_set rval; | |
807 | rval.insert(m_node_num); | |
808 | return rval; | |
809 | } | |
810 | ||
811 | inline node_set | |
812 | eof_node::lastpos() const | |
813 | { | |
814 | node_set rval; | |
815 | rval.insert(m_node_num); | |
816 | return rval; | |
817 | } | |
818 | ||
819 | inline void | |
820 | eof_node::compute_followpos(followpos_t&) const | |
821 | { | |
822 | return; | |
823 | } | |
824 | ||
825 | inline void | |
826 | eof_node::compute_state_match(state_match_t& state_match) const | |
827 | { | |
828 | if (state_match.size() < m_node_num + 1) | |
829 | state_match.resize(m_node_num + 1); | |
830 | state_match[m_node_num].resize(256, 0); | |
831 | } | |
832 | ||
833 | inline void | |
834 | eof_node::get_eof_ids(node_set& eof_nodes) const | |
835 | { | |
836 | eof_nodes.insert(m_node_num); | |
837 | } | |
838 | ||
839 | inline void | |
840 | eof_node::assign_node_ids(node_id_t& node_count) | |
841 | { | |
842 | m_node_num = node_count++; | |
843 | } | |
844 | ||
845 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
846 | inline void | |
847 | eof_node::dump(std::ostream& out) const | |
848 | { | |
849 | out << "\neof_node"; | |
850 | out << " m_node_num = " << m_node_num; | |
851 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
852 | out << " firstpos() = "; | |
853 | node_set fp = firstpos(); | |
854 | std::copy(fp.begin(), fp.end(), | |
855 | std::ostream_iterator<node_id_t>(out, ",")); | |
856 | ||
857 | out << " lastpos() = "; | |
858 | node_set lp = lastpos(); | |
859 | std::copy(lp.begin(), lp.end(), | |
860 | std::ostream_iterator<node_id_t>(out, ",")); | |
861 | } | |
862 | #endif | |
863 | ||
864 | class ccl_node : public node | |
865 | { | |
866 | ||
867 | public: | |
868 | ||
869 | ccl_node(const std::vector<uchar>& v); | |
870 | ccl_node(const uchar c1, const uchar c2); | |
871 | ccl_node(const ccl_node& x); | |
872 | virtual ~ccl_node(){} | |
873 | ||
20effc67 TL |
874 | node* clone() const BOOST_OVERRIDE; |
875 | bool nullable() const BOOST_OVERRIDE; | |
876 | node_set firstpos() const BOOST_OVERRIDE; | |
877 | node_set lastpos() const BOOST_OVERRIDE; | |
878 | void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE; | |
879 | void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE; | |
880 | void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE; | |
881 | void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE; | |
7c673cae | 882 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
20effc67 | 883 | void dump(std::ostream& out) const BOOST_OVERRIDE; |
7c673cae FG |
884 | #endif |
885 | ||
886 | private: | |
887 | ||
888 | std::vector<uchar> m_match; | |
889 | node_id_t m_node_num; | |
890 | }; | |
891 | ||
892 | inline | |
893 | ccl_node::ccl_node(const std::vector<uchar>& v) | |
894 | : node() | |
895 | , m_match(v) | |
896 | , m_node_num(0) | |
897 | { | |
898 | m_match.resize(256); // make sure it's the right size | |
899 | } | |
900 | ||
901 | inline | |
902 | ccl_node::ccl_node(const uchar c1, const uchar c2) | |
903 | : node() | |
904 | , m_match(256, uchar(0)) | |
905 | , m_node_num(0) | |
906 | { | |
907 | BOOST_ASSERT(c1 < c2); | |
908 | for (std::size_t i = c1; i <= std::size_t(c2); ++i) | |
909 | { | |
910 | m_match[i] = 1; | |
911 | } | |
912 | } | |
913 | ||
914 | inline | |
915 | ccl_node::ccl_node(const ccl_node& x) | |
916 | : node(x) | |
917 | , m_match(x.m_match) | |
918 | , m_node_num(x.m_node_num) | |
919 | { | |
920 | } | |
921 | ||
922 | inline node * | |
923 | ccl_node::clone() const | |
924 | { | |
925 | return new ccl_node(*this); | |
926 | } | |
927 | ||
928 | inline bool | |
929 | ccl_node::nullable() const | |
930 | { | |
931 | return false; | |
932 | } | |
933 | ||
934 | inline node_set | |
935 | ccl_node::firstpos() const | |
936 | { | |
937 | node_set rval; | |
938 | rval.insert(m_node_num); | |
939 | return rval; | |
940 | } | |
941 | ||
942 | inline node_set | |
943 | ccl_node::lastpos() const | |
944 | { | |
945 | return firstpos(); | |
946 | } | |
947 | ||
948 | inline void | |
949 | ccl_node::compute_followpos(followpos_t&) const | |
950 | { | |
951 | return; | |
952 | } | |
953 | ||
954 | inline void | |
955 | ccl_node::compute_state_match(state_match_t& state_match) const | |
956 | { | |
957 | if (state_match.size() < m_node_num + 1) | |
958 | state_match.resize(m_node_num + 1); | |
959 | state_match[m_node_num] = m_match; | |
960 | } | |
961 | ||
962 | inline void | |
963 | ccl_node::get_eof_ids(node_set&) const | |
964 | { | |
965 | return; | |
966 | } | |
967 | ||
968 | inline void | |
969 | ccl_node::assign_node_ids(node_id_t& node_count) | |
970 | { | |
971 | m_node_num = node_count++; | |
972 | } | |
973 | ||
974 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
975 | inline void | |
976 | ccl_node::dump(std::ostream& out) const | |
977 | { | |
978 | out << "\nccl_node m_match = "; | |
979 | for (std::size_t i = 0; i < m_match.size(); ++i) | |
980 | { | |
981 | if (m_match[i]) | |
982 | out << i << ", "; | |
983 | } | |
984 | out << " m_node_num = " << m_node_num; | |
985 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
986 | out << " firstpos() = "; | |
987 | node_set fp = firstpos(); | |
988 | std::copy(fp.begin(), fp.end(), | |
989 | std::ostream_iterator<node_id_t>(out, ",")); | |
990 | ||
991 | out << " lastpos() = "; | |
992 | node_set lp = lastpos(); | |
993 | std::copy(lp.begin(), lp.end(), | |
994 | std::ostream_iterator<node_id_t>(out, ",")); | |
995 | } | |
996 | #endif | |
997 | ||
998 | template <typename ScannerT> | |
999 | class make_concat | |
1000 | { | |
1001 | typedef typename ScannerT::iterator_t iterator_type; | |
1002 | ||
1003 | public: | |
1004 | ||
1005 | make_concat(std::stack<node*>& the_stack) | |
1006 | : m_stack(the_stack) | |
1007 | {} | |
1008 | ||
1009 | void operator()(iterator_type const &, iterator_type const &) const | |
1010 | { | |
1011 | node* right = m_stack.top(); | |
1012 | m_stack.pop(); | |
1013 | node* left = m_stack.top(); | |
1014 | m_stack.pop(); | |
1015 | node* newnode = new cat_node(left, right); | |
1016 | m_stack.push(newnode); | |
1017 | } | |
1018 | ||
1019 | std::stack<node*>& m_stack; | |
1020 | }; | |
1021 | ||
1022 | template <int CharTSize> | |
1023 | struct get_byte_aux; | |
1024 | ||
1025 | template<> | |
1026 | struct get_byte_aux<1> | |
1027 | { | |
1028 | template <typename CharT> | |
1029 | unsigned char operator()(CharT c, unsigned int byte) | |
1030 | { | |
1031 | BOOST_ASSERT(byte == 0); | |
1032 | return c; | |
1033 | } | |
1034 | }; | |
1035 | ||
1036 | template<> | |
1037 | struct get_byte_aux<2> | |
1038 | { | |
1039 | template <typename CharT> | |
1040 | unsigned char operator()(CharT c, unsigned int byte) | |
1041 | { | |
1042 | static unsigned long mask[] = | |
1043 | { | |
1044 | 0xFF00, | |
1045 | 0x00FF | |
1046 | }; | |
1047 | ||
1048 | BOOST_ASSERT(byte < 2); | |
1049 | return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); | |
1050 | } | |
1051 | }; | |
1052 | ||
1053 | template<> | |
1054 | struct get_byte_aux<4> | |
1055 | { | |
1056 | template <typename CharT> | |
1057 | unsigned char operator()(CharT c, unsigned int byte) | |
1058 | { | |
1059 | static unsigned long mask[] = | |
1060 | { | |
1061 | 0xFF000000, | |
1062 | 0x00FF0000, | |
1063 | 0x0000FF00, | |
1064 | 0x000000FF | |
1065 | }; | |
1066 | ||
1067 | BOOST_ASSERT(byte < 4); | |
1068 | return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); | |
1069 | } | |
1070 | }; | |
1071 | ||
1072 | template <typename CharT> | |
1073 | inline unsigned char | |
1074 | get_byte(CharT c, unsigned int byte) | |
1075 | { | |
1076 | return get_byte_aux<sizeof(c)>()(c, byte); | |
1077 | } | |
1078 | ||
1079 | template <typename ScannerT> | |
1080 | class make_star | |
1081 | { | |
1082 | typedef typename ScannerT::iterator_t iterator_type; | |
1083 | ||
1084 | public: | |
1085 | typedef | |
1086 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1087 | char_t; | |
1088 | ||
1089 | make_star(std::stack<node*>& the_stack) | |
1090 | : m_stack(the_stack) | |
1091 | {} | |
1092 | ||
1093 | void operator()(const char_t) const | |
1094 | { | |
1095 | node* left = m_stack.top(); | |
1096 | m_stack.pop(); | |
1097 | node* newnode = new star_node(left); | |
1098 | m_stack.push(newnode); | |
1099 | } | |
1100 | ||
1101 | std::stack<node*>& m_stack; | |
1102 | }; | |
1103 | ||
1104 | template <typename ScannerT> | |
1105 | class make_or | |
1106 | { | |
1107 | typedef typename ScannerT::iterator_t iterator_type; | |
1108 | ||
1109 | public: | |
1110 | ||
1111 | make_or(std::stack<node*>& the_stack) | |
1112 | : m_stack(the_stack) | |
1113 | {} | |
1114 | ||
1115 | void operator()(iterator_type const&, iterator_type const&) const | |
1116 | { | |
1117 | node* right = m_stack.top(); | |
1118 | m_stack.pop(); | |
1119 | node* left = m_stack.top(); | |
1120 | m_stack.pop(); | |
1121 | node* newnode = new or_node(left, right); | |
1122 | m_stack.push(newnode); | |
1123 | } | |
1124 | ||
1125 | std::stack<node*>& m_stack; | |
1126 | }; | |
1127 | ||
1128 | template <typename ScannerT> | |
1129 | class make_plus | |
1130 | { | |
1131 | typedef typename ScannerT::iterator_t iterator_type; | |
1132 | ||
1133 | public: | |
1134 | typedef | |
1135 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1136 | char_t; | |
1137 | ||
1138 | make_plus(std::stack<node*>& the_stack) | |
1139 | : m_stack(the_stack) | |
1140 | {} | |
1141 | ||
1142 | void operator()(const char_t) const | |
1143 | { | |
1144 | node* left = m_stack.top(); | |
1145 | m_stack.pop(); | |
1146 | ||
1147 | node* copy = left->clone(); | |
1148 | ||
1149 | node* new_star = new star_node(copy); | |
1150 | node* new_cat = new cat_node(left, new_star); | |
1151 | m_stack.push(new_cat); | |
1152 | } | |
1153 | ||
1154 | std::stack<node*>& m_stack; | |
1155 | }; | |
1156 | ||
1157 | template <typename ScannerT> | |
1158 | class make_optional | |
1159 | { | |
1160 | typedef typename ScannerT::iterator_t iterator_type; | |
1161 | ||
1162 | public: | |
1163 | typedef | |
1164 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1165 | char_t; | |
1166 | ||
1167 | make_optional(std::stack<node*>& the_stack) | |
1168 | : m_stack(the_stack) | |
1169 | {} | |
1170 | ||
1171 | void operator()(const char_t) const | |
1172 | { | |
1173 | node* left = m_stack.top(); | |
1174 | m_stack.pop(); | |
1175 | ||
1176 | node* new_or = new or_node(left, new epsilon_node()); | |
1177 | m_stack.push(new_or); | |
1178 | } | |
1179 | ||
1180 | std::stack<node*>& m_stack; | |
1181 | }; | |
1182 | ||
1183 | /////////////////////////////////////////////////////////////////////////////// | |
1184 | // utility function | |
1185 | template <typename CharT> | |
1186 | inline utility::impl::range<CharT> const& | |
1187 | full_range() | |
1188 | { | |
1189 | static utility::impl::range<CharT> full((std::numeric_limits<CharT>::min)(), | |
1190 | (std::numeric_limits<CharT>::max)()); | |
1191 | return full; | |
1192 | } | |
1193 | ||
1194 | namespace ccl_utils | |
1195 | { | |
1196 | template <typename char_t> | |
1197 | inline utility::impl::range_run<char_t> | |
1198 | negate_range_run( | |
1199 | const utility::impl::range_run<char_t>& rr) | |
1200 | { | |
1201 | utility::impl::range_run<char_t> newrr; | |
1202 | newrr.set(full_range<char_t>()); | |
1203 | for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); | |
1204 | iter != rr.end(); ++iter) | |
1205 | newrr.clear(*iter); | |
1206 | return newrr; | |
1207 | } | |
1208 | ||
1209 | template <typename char_t> | |
1210 | inline node* | |
1211 | create_mb_node_seq(char_t c) | |
1212 | { | |
1213 | node* newnode = new char_node(get_byte(c, 0)); | |
1214 | for (unsigned int i = 1; i < sizeof(c); ++i) | |
1215 | { | |
1216 | node* cnode = new char_node(get_byte(c, i)); | |
1217 | node* top_node = new cat_node(newnode, cnode); | |
1218 | newnode = top_node; | |
1219 | } | |
1220 | return newnode; | |
1221 | } | |
1222 | ||
1223 | template <typename char_t> | |
1224 | inline void | |
1225 | handle_mb_char(char_t c, bool first_time, | |
1226 | std::stack<node*>& stack) | |
1227 | { | |
1228 | node* newnode = create_mb_node_seq(c); | |
1229 | ||
1230 | if (first_time) | |
1231 | { | |
1232 | stack.push(newnode); | |
1233 | } | |
1234 | else | |
1235 | { | |
1236 | node* top = stack.top(); | |
1237 | stack.pop(); | |
1238 | ||
1239 | node* newtop = new or_node(top, newnode); | |
1240 | stack.push(newtop); | |
1241 | } | |
1242 | } | |
1243 | ||
1244 | // forward decl only | |
1245 | template <typename char_t> | |
1246 | inline void | |
1247 | handle_mb_range(char_t c1, char_t c2, bool first_time, | |
1248 | std::stack<node*>& stack); | |
1249 | ||
1250 | template <typename char_t> | |
1251 | inline void | |
1252 | create_nodes(const utility::impl::range_run<char_t>& rr, | |
1253 | std::stack<node*>& stack) | |
1254 | { | |
1255 | ||
1256 | if (sizeof(char_t) == 1) | |
1257 | { | |
1258 | std::vector<uchar> ccl; | |
1259 | ccl.resize(256); | |
1260 | for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); | |
1261 | iter != rr.end(); ++iter) | |
1262 | { | |
1263 | for (int i = iter->first; i <= iter->last; ++i) | |
1264 | { | |
1265 | // this is always true because of the limited datatype | |
1266 | // BOOST_ASSERT(uchar(i) < 256 && ccl.size() == 256); | |
1267 | ccl[uchar(i)] = 1; | |
1268 | } | |
1269 | } | |
1270 | ||
1271 | node* new_ccl = new ccl_node(ccl); | |
1272 | stack.push(new_ccl); | |
1273 | } | |
1274 | else | |
1275 | { | |
1276 | bool mb_first_time = true; | |
1277 | for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); | |
1278 | iter != rr.end(); ++iter) | |
1279 | { | |
1280 | if (iter->first == iter->last) | |
1281 | { | |
1282 | handle_mb_char(iter->first, mb_first_time, stack); | |
1283 | } | |
1284 | else | |
1285 | { | |
1286 | handle_mb_range(iter->first, iter->last, mb_first_time, stack); | |
1287 | } | |
1288 | mb_first_time = false; | |
1289 | } | |
1290 | } | |
1291 | } | |
1292 | ||
1293 | template <typename char_t> | |
1294 | inline std::size_t | |
1295 | compute_differing_byte(char_t c1, char_t c2) | |
1296 | { | |
1297 | std::size_t rval = 0; | |
1298 | while (rval < sizeof(c1) && | |
1299 | get_byte(c1, (unsigned int)rval) == get_byte(c2, (unsigned int)rval)) | |
1300 | { | |
1301 | ++rval; | |
1302 | } | |
1303 | return rval; | |
1304 | } | |
1305 | ||
1306 | template <typename char_t> | |
1307 | inline node* | |
1308 | create_mb_node_type1(std::size_t j, char_t c1, char_t c2) | |
1309 | { | |
1310 | std::size_t diff = get_byte(c2, (unsigned int)j) - | |
1311 | get_byte(c1, (unsigned int)j); | |
1312 | if (diff == 1) { | |
1313 | return 0; | |
1314 | } | |
1315 | else if (diff == 2) { | |
1316 | return new char_node(get_byte(c1, (unsigned int)j)+1); | |
1317 | } | |
1318 | else { | |
1319 | return new ccl_node(get_byte(c1, (unsigned int)j)+1, | |
1320 | get_byte(c2, (unsigned int)j)-1); | |
1321 | } | |
1322 | } | |
1323 | ||
1324 | template <typename char_t> | |
1325 | inline node * | |
1326 | create_mb_node_for_byte(std::size_t i, std::size_t j, std::size_t sizem1, | |
1327 | std::size_t differing_byte, char_t c1, char_t c2, node* newnode) | |
1328 | { | |
1329 | node* cnode; | |
1330 | if (i == sizem1 && j == differing_byte && j != sizem1) | |
1331 | { | |
1332 | node* tmp = create_mb_node_type1(j, c1, c2); | |
1333 | if (tmp == 0) | |
1334 | { | |
1335 | delete newnode; | |
1336 | return 0; | |
1337 | } | |
1338 | else | |
1339 | cnode = tmp; | |
1340 | } | |
1341 | else if (i == differing_byte && j == sizem1) | |
1342 | { | |
1343 | if (i != sizem1) { | |
1344 | cnode = new ccl_node(get_byte(c1, (unsigned int)j), 0xFF); | |
1345 | } | |
1346 | else { | |
1347 | cnode = new ccl_node(get_byte(c1, (unsigned int)j), | |
1348 | get_byte(c2, (unsigned int)j)); | |
1349 | } | |
1350 | } | |
1351 | else if (i != differing_byte && i != sizem1 && | |
1352 | j == (sizem1 - i + differing_byte)) | |
1353 | { | |
1354 | cnode = new ccl_node(get_byte(c1, (unsigned int)j)+1, 0xFF); | |
1355 | } | |
1356 | else if (i + j - differing_byte > sizem1) { | |
1357 | cnode = new ccl_node(0, 0xFF); | |
1358 | } | |
1359 | else {//if (is plain) | |
1360 | cnode = new char_node(get_byte(c1, (unsigned int)j)); | |
1361 | } | |
1362 | ||
1363 | node* top_node = new cat_node(newnode, cnode); | |
1364 | return top_node; | |
1365 | } | |
1366 | ||
1367 | // On platforms, where wchar_t is a typedef for unsigned short, the | |
1368 | // comparision for a negative value is pointless | |
1369 | template <bool is_signed> | |
1370 | struct correct_char_aux { | |
1371 | }; | |
1372 | ||
1373 | template <> | |
1374 | struct correct_char_aux<true> { | |
1375 | ||
1376 | template <typename char_t> | |
1377 | static char_t correct(char_t c) { if (c < 0) c = 0; return c; } | |
1378 | }; | |
1379 | ||
1380 | template <> | |
1381 | struct correct_char_aux<false> { | |
1382 | ||
1383 | template <typename char_t> | |
1384 | static char_t correct(char_t c) { return c; } | |
1385 | }; | |
1386 | ||
1387 | template <typename char_t> | |
1388 | struct correct_char | |
1389 | { | |
1390 | static char_t correct(char_t c) | |
1391 | { | |
1392 | return correct_char_aux<std::numeric_limits<char_t>::is_signed >:: | |
1393 | correct(c); | |
1394 | } | |
1395 | }; | |
1396 | ||
1397 | template <typename char_t> | |
1398 | inline void | |
1399 | handle_mb_range(char_t c1, char_t c2, bool first_time, | |
1400 | std::stack<node*>& stack) | |
1401 | { | |
1402 | // The algorithm can't handle negative value chars, which don't make | |
1403 | // much sense anyway. This comparision is pointless for wchar_t's on | |
1404 | // platforms, where wchar_t is a typedef for unsigned short | |
1405 | ||
1406 | c1 = correct_char<char_t>::correct(c1); | |
1407 | //if (c1 < 0) | |
1408 | // c1 = 0; | |
1409 | ||
1410 | BOOST_ASSERT(c1 < c2); | |
1411 | node* newnode = 0; | |
1412 | node* savednode = 0; | |
1413 | const std::size_t differing_byte = compute_differing_byte(c1, c2); | |
1414 | const std::size_t sizem1 = sizeof(c1) - 1; | |
1415 | const std::size_t ndb = sizem1 - differing_byte; | |
1416 | for (std::size_t i = differing_byte; i < sizeof(c1); ++i) | |
1417 | { | |
1418 | // generate node for the first byte | |
1419 | if (differing_byte == 0 && i == ndb) | |
1420 | { | |
1421 | node* tmp = create_mb_node_type1(0, c1, c2); | |
1422 | if (tmp == 0) | |
1423 | continue; | |
1424 | else | |
1425 | newnode = tmp; | |
1426 | } | |
1427 | else | |
1428 | { | |
1429 | newnode = new char_node(get_byte(c1, 0)); | |
1430 | } | |
1431 | for (std::size_t j = 1; j < sizeof(c1); ++j) | |
1432 | { | |
1433 | newnode = create_mb_node_for_byte(i, j, sizem1, differing_byte, | |
1434 | c1, c2, newnode); | |
1435 | if (newnode == 0) | |
1436 | goto end_outer_for; | |
1437 | } | |
1438 | ||
1439 | // or together the various parts | |
1440 | if (savednode) | |
1441 | { | |
1442 | node* top_node = new or_node(savednode, newnode); | |
1443 | savednode = top_node; | |
1444 | } | |
1445 | else | |
1446 | { | |
1447 | savednode = newnode; | |
1448 | } | |
1449 | end_outer_for: | |
1450 | continue; | |
1451 | } | |
1452 | ||
1453 | for (std::size_t k = 0; k < ndb; ++k) | |
1454 | { | |
1455 | newnode = new char_node(get_byte(c2, 0)); | |
1456 | for (std::size_t j = 1; j < sizeof(c2); ++j) | |
1457 | { | |
1458 | node* cnode; | |
1459 | if (k == differing_byte && j == sizem1 && k != sizem1) | |
1460 | cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)); | |
1461 | ||
1462 | else if (k != differing_byte && k != sizem1 && | |
1463 | j == (sizem1 - k + differing_byte)) | |
1464 | cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)-1); | |
1465 | ||
1466 | else if (k + j - differing_byte > sizem1) | |
1467 | cnode = new ccl_node(0, 0xFF); | |
1468 | ||
1469 | else //if (is plain) | |
1470 | cnode = new char_node(get_byte(c2, (unsigned int)j)); | |
1471 | ||
1472 | ||
1473 | node* top_node = new cat_node(newnode, cnode); | |
1474 | newnode = top_node; | |
1475 | } | |
1476 | ||
1477 | // or together the various parts | |
1478 | if (savednode) | |
1479 | { | |
1480 | node* top_node = new or_node(savednode, newnode); | |
1481 | savednode = top_node; | |
1482 | } | |
1483 | else | |
1484 | { | |
1485 | savednode = newnode; | |
1486 | } | |
1487 | } | |
1488 | ||
1489 | ||
1490 | if (first_time) | |
1491 | { | |
1492 | stack.push(savednode); | |
1493 | } | |
1494 | else | |
1495 | { | |
1496 | node* top = stack.top(); | |
1497 | stack.pop(); | |
1498 | ||
1499 | node* newtop = new or_node(top, savednode); | |
1500 | stack.push(newtop); | |
1501 | } | |
1502 | } | |
1503 | } // namespace ccl_utils | |
1504 | ||
1505 | template <typename ScannerT> | |
1506 | class make_char | |
1507 | { | |
1508 | typedef typename ScannerT::iterator_t iterator_type; | |
1509 | ||
1510 | public: | |
1511 | typedef | |
1512 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1513 | char_t; | |
1514 | ||
1515 | make_char(std::stack<node*>& the_stack) | |
1516 | : m_stack(the_stack) | |
1517 | {} | |
1518 | ||
1519 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1520 | { | |
1521 | const escape_char_parser<lex_escapes, char_t> lex_escape_ch = | |
1522 | escape_char_parser<lex_escapes, char_t>(); | |
1523 | char_t the_char; | |
1524 | iterator_type first_ = first; | |
1525 | ScannerT scan(first_, last); | |
1526 | lex_escape_ch[assign(the_char)].parse(scan); | |
1527 | node* newnode = ccl_utils::create_mb_node_seq(the_char); | |
1528 | m_stack.push(newnode); | |
1529 | } | |
1530 | ||
1531 | std::stack<node*>& m_stack; | |
1532 | }; | |
1533 | ||
1534 | ||
1535 | template <typename ScannerT> | |
1536 | class make_ccl | |
1537 | { | |
1538 | typedef typename ScannerT::iterator_t iterator_type; | |
1539 | ||
1540 | public: | |
1541 | typedef | |
1542 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1543 | char_t; | |
1544 | ||
1545 | make_ccl(std::stack<node*>& the_stack) | |
1546 | : m_stack(the_stack) | |
1547 | {} | |
1548 | ||
1549 | static bool is_equal_to_string(iterator_type first, | |
1550 | iterator_type const & last, const char* str) | |
1551 | { | |
1552 | while (first != last &&*str &&*first ==*str) | |
1553 | { | |
1554 | ++first; | |
1555 | ++str; | |
1556 | } | |
1557 | return*str == 0; | |
1558 | } | |
1559 | ||
1560 | template <typename ParserT> | |
1561 | static void fill_ccl(utility::impl::range_run<char_t>& rr, const ParserT& parser) | |
1562 | { | |
1563 | for (int i = 0; i < 256; ++i) | |
1564 | { | |
1565 | if (parser.test(static_cast<char_t>(uchar(i)))) | |
1566 | rr.set(utility::impl::range<char_t>(char_t(i), char_t(i))); | |
1567 | } | |
1568 | } | |
1569 | ||
1570 | void operator()(iterator_type const& first_, iterator_type const& last) const | |
1571 | { | |
1572 | BOOST_ASSERT(*first_ == '['); | |
1573 | ||
1574 | iterator_type first = first_; | |
1575 | ++first; // skip over '[' | |
1576 | bool negated_ccl = false; | |
1577 | if (*first == '^') | |
1578 | { | |
1579 | negated_ccl = true; | |
1580 | ++first; | |
1581 | } | |
1582 | ||
1583 | utility::impl::range_run<char_t> rr; | |
1584 | while (first != last &&*first != ']') | |
1585 | { | |
1586 | if (*first == '[') // it's a ccl_expr like [:space:] | |
1587 | { | |
1588 | // check for [:space:], etc. | |
1589 | if (is_equal_to_string(first, last, "[:alnum:]")) | |
1590 | { | |
1591 | fill_ccl(rr, alnum_p); | |
1592 | } | |
1593 | else if (is_equal_to_string(first, last, "[:alpha:]")) | |
1594 | { | |
1595 | fill_ccl(rr, alpha_p); | |
1596 | } | |
1597 | else if (is_equal_to_string(first, last, "[:blank:]")) | |
1598 | { | |
1599 | fill_ccl(rr, blank_p); | |
1600 | } | |
1601 | else if (is_equal_to_string(first, last, "[:cntrl:]")) | |
1602 | { | |
1603 | fill_ccl(rr, cntrl_p); | |
1604 | } | |
1605 | else if (is_equal_to_string(first, last, "[:digit:]")) | |
1606 | { | |
1607 | fill_ccl(rr, digit_p); | |
1608 | } | |
1609 | else if (is_equal_to_string(first, last, "[:graph:]")) | |
1610 | { | |
1611 | fill_ccl(rr, graph_p); | |
1612 | } | |
1613 | else if (is_equal_to_string(first, last, "[:lower:]")) | |
1614 | { | |
1615 | fill_ccl(rr, lower_p); | |
1616 | } | |
1617 | else if (is_equal_to_string(first, last, "[:print:]")) | |
1618 | { | |
1619 | fill_ccl(rr, print_p); | |
1620 | } | |
1621 | else if (is_equal_to_string(first, last, "[:punct:]")) | |
1622 | { | |
1623 | fill_ccl(rr, punct_p); | |
1624 | } | |
1625 | else if (is_equal_to_string(first, last, "[:space:]")) | |
1626 | { | |
1627 | fill_ccl(rr, space_p); | |
1628 | } | |
1629 | else if (is_equal_to_string(first, last, "[:upper:]")) | |
1630 | { | |
1631 | fill_ccl(rr, upper_p); | |
1632 | } | |
1633 | else if (is_equal_to_string(first, last, "[:xdigit:]")) | |
1634 | { | |
1635 | fill_ccl(rr, xdigit_p); | |
1636 | } | |
1637 | // this can't happen, because it's parsed before we get here. | |
1638 | //else | |
1639 | // throw bad_regex(); | |
1640 | ||
1641 | // Advance past the character class expression | |
1642 | while (first != last &&*first != ']') | |
1643 | ++first; | |
1644 | BOOST_ASSERT(*first == ']'); | |
1645 | ++first; | |
1646 | } | |
1647 | else { | |
1648 | const escape_char_parser<lex_escapes, char_t> lex_escape_ch = | |
1649 | escape_char_parser<lex_escapes, char_t>(); | |
1650 | ||
1651 | char_t c1; | |
1652 | ScannerT scan(first, last); | |
1653 | lex_escape_ch[assign(c1)].parse(scan); | |
1654 | if (*scan.first == '-') // insert a range | |
1655 | { | |
1656 | ++scan.first; | |
1657 | char_t c2; | |
1658 | lex_escape_ch[assign(c2)].parse(scan); | |
1659 | BOOST_ASSERT(c1 < c2); // Throw exception? | |
1660 | rr.set(utility::impl::range<char_t>(c1, c2)); | |
1661 | } | |
1662 | else // insert 1 char | |
1663 | { | |
1664 | rr.set(utility::impl::range<char_t>(c1, c1)); | |
1665 | } | |
1666 | } | |
1667 | } | |
1668 | ||
1669 | if (negated_ccl) | |
1670 | { | |
1671 | rr = ccl_utils::negate_range_run(rr); | |
1672 | } | |
1673 | ||
1674 | ccl_utils::create_nodes(rr, m_stack); | |
1675 | } | |
1676 | ||
1677 | std::stack<node*>& m_stack; | |
1678 | }; | |
1679 | ||
1680 | template <typename ScannerT> | |
1681 | class make_any_char | |
1682 | { | |
1683 | typedef typename ScannerT::iterator_t iterator_type; | |
1684 | ||
1685 | public: | |
1686 | typedef | |
1687 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1688 | char_t; | |
1689 | ||
1690 | std::stack<node*>& m_stack; | |
1691 | ||
1692 | make_any_char(std::stack<node*>& the_stack) | |
1693 | : m_stack(the_stack) | |
1694 | {} | |
1695 | ||
1696 | void operator()(const char_t c) const | |
1697 | { | |
1698 | BOOST_ASSERT(c == '.'); | |
1699 | do_any_char(); | |
1700 | } | |
1701 | ||
1702 | void do_any_char() const | |
1703 | { | |
1704 | static utility::impl::range_run<char_t> rr; | |
1705 | rr.set(full_range<char_t>()); | |
1706 | char_t newline = '\n'; | |
1707 | rr.clear(utility::impl::range<char_t>(newline, newline)); | |
1708 | ||
1709 | ccl_utils::create_nodes(rr, m_stack); | |
1710 | } | |
1711 | }; | |
1712 | ||
1713 | template <typename ScannerT> | |
1714 | class make_string | |
1715 | { | |
1716 | typedef typename ScannerT::iterator_t iterator_type; | |
1717 | ||
1718 | public: | |
1719 | typedef | |
1720 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1721 | char_t; | |
1722 | ||
1723 | std::stack<node*>& m_stack; | |
1724 | ||
1725 | make_string(std::stack<node*>& the_stack) | |
1726 | : m_stack(the_stack) | |
1727 | {} | |
1728 | ||
1729 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1730 | { | |
1731 | BOOST_ASSERT(*first == '"'); | |
1732 | ||
1733 | iterator_type first_ = first; | |
1734 | ScannerT scan(first_, last); | |
1735 | ++scan.first; // skip over '"' | |
1736 | ||
1737 | // empty string not allowed | |
1738 | if (*scan.first == '"') | |
1739 | { | |
1740 | boost::throw_exception(bad_regex()); | |
1741 | } | |
1742 | ||
1743 | const escape_char_parser<lex_escapes, char_t> lex_escape_ch = | |
1744 | escape_char_parser<lex_escapes, char_t>(); | |
1745 | ||
1746 | char_t c; | |
1747 | lex_escape_ch[assign(c)].parse(scan); | |
1748 | node* top_node = ccl_utils::create_mb_node_seq(c); | |
1749 | ||
1750 | while (*scan.first != '"' && scan.first != scan.last) | |
1751 | { | |
1752 | lex_escape_ch[assign(c)].parse(scan); | |
1753 | node* cur_node = ccl_utils::create_mb_node_seq(c); | |
1754 | top_node = new cat_node(top_node, cur_node); | |
1755 | } | |
1756 | m_stack.push(top_node); | |
1757 | } | |
1758 | }; | |
1759 | ||
1760 | inline | |
1761 | node* repeat_node(node* n, int num) | |
1762 | { | |
1763 | node* list_of_nodes = n; | |
1764 | for (int i = 1; i < num; ++i) | |
1765 | { | |
1766 | list_of_nodes = new cat_node(list_of_nodes, n->clone()); | |
1767 | } | |
1768 | return list_of_nodes; | |
1769 | } | |
1770 | ||
1771 | inline | |
1772 | node* optional_node(node* n) | |
1773 | { | |
1774 | return new or_node(n, new epsilon_node()); | |
1775 | } | |
1776 | ||
1777 | template <typename ScannerT> | |
1778 | class make_rep1 | |
1779 | { | |
1780 | typedef typename ScannerT::iterator_t iterator_type; | |
1781 | ||
1782 | public: | |
1783 | typedef | |
1784 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1785 | char_t; | |
1786 | ||
1787 | std::stack<node*>& m_stack; | |
1788 | ||
1789 | make_rep1(std::stack<node*>& the_stack) | |
1790 | : m_stack(the_stack) | |
1791 | {} | |
1792 | ||
1793 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1794 | { | |
1795 | BOOST_ASSERT(*first == '{'); | |
1796 | ||
1797 | iterator_type first_ = first; | |
1798 | ScannerT scan(first_, last); | |
1799 | ++scan.first; // skip over '{' | |
1800 | ||
1801 | unsigned int count; | |
1802 | uint_p[assign(count)].parse(scan); | |
1803 | if (count == 0) | |
1804 | boost::throw_exception(bad_regex()); | |
1805 | ||
1806 | node* top_node = m_stack.top(); | |
1807 | m_stack.pop(); | |
1808 | top_node = repeat_node(top_node, count); | |
1809 | m_stack.push(top_node); | |
1810 | } | |
1811 | }; | |
1812 | ||
1813 | template <typename ScannerT> | |
1814 | class make_rep2 | |
1815 | { | |
1816 | typedef typename ScannerT::iterator_t iterator_type; | |
1817 | ||
1818 | public: | |
1819 | typedef | |
1820 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1821 | char_t; | |
1822 | ||
1823 | std::stack<node*>& m_stack; | |
1824 | ||
1825 | make_rep2(std::stack<node*>& the_stack) | |
1826 | : m_stack(the_stack) | |
1827 | {} | |
1828 | ||
1829 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1830 | { | |
1831 | BOOST_ASSERT(*first == '{'); | |
1832 | ||
1833 | iterator_type first_ = first; | |
1834 | ScannerT scan (first_, last); | |
1835 | ++scan.first; // skip over '{' | |
1836 | ||
1837 | unsigned int count; | |
1838 | uint_p[assign(count)].parse(scan); | |
1839 | if (count == 0) | |
1840 | boost::throw_exception(bad_regex()); | |
1841 | ||
1842 | node* top_node = m_stack.top(); | |
1843 | m_stack.pop(); | |
1844 | top_node = new cat_node(repeat_node(top_node, count), | |
1845 | new star_node(top_node->clone())); | |
1846 | m_stack.push(top_node); | |
1847 | ||
1848 | } | |
1849 | }; | |
1850 | ||
1851 | template <typename ScannerT> | |
1852 | class make_rep3 | |
1853 | { | |
1854 | typedef typename ScannerT::iterator_t iterator_type; | |
1855 | ||
1856 | public: | |
1857 | typedef | |
1858 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1859 | char_t; | |
1860 | ||
1861 | std::stack<node*>& m_stack; | |
1862 | ||
1863 | make_rep3(std::stack<node*>& the_stack) | |
1864 | : m_stack(the_stack) | |
1865 | {} | |
1866 | ||
1867 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1868 | { | |
1869 | BOOST_ASSERT(*first == '{'); | |
1870 | ||
1871 | iterator_type first_ = first; | |
1872 | ScannerT scan(first_, last); | |
1873 | ++scan.first; // skip over '{' | |
1874 | ||
1875 | unsigned int count1, count2; | |
1876 | uint_p[assign(count1)].parse(scan); | |
1877 | if (count1 == 0) | |
1878 | boost::throw_exception(bad_regex()); | |
1879 | ||
1880 | ++scan.first; // skip over ',' | |
1881 | ||
1882 | uint_p[assign(count2)].parse(scan); | |
1883 | if (count2 <= count1) | |
1884 | boost::throw_exception(bad_regex()); | |
1885 | ||
1886 | node* top_node = m_stack.top(); | |
1887 | m_stack.pop(); | |
1888 | node* repeats = repeat_node(top_node, count1); | |
1889 | top_node = new cat_node(repeats, | |
1890 | repeat_node(optional_node(top_node->clone()), | |
1891 | count2 - count1)); | |
1892 | ||
1893 | m_stack.push(top_node); | |
1894 | } | |
1895 | }; | |
1896 | ||
1897 | /////////////////////////////////////////////////////////////////////////////// | |
1898 | // | |
1899 | // Lexer grammar | |
1900 | // | |
1901 | // Defines the grammar, which mandates the syntax of the understood | |
1902 | // lexeme definitions passed to lexer::register_regex. | |
1903 | // | |
1904 | /////////////////////////////////////////////////////////////////////////////// | |
1905 | class lexer_grammar : public boost::spirit::classic::grammar<lexer_grammar> | |
1906 | { | |
1907 | public: | |
1908 | lexer_grammar(std::stack<node*> &node_stack_) | |
1909 | : node_stack(node_stack_) {} | |
1910 | ||
1911 | template <typename ScannerT> | |
1912 | struct definition | |
1913 | { | |
1914 | typedef rule<ScannerT> rule_t; | |
1915 | typedef typename ScannerT::iterator_t iterator_type; | |
1916 | typedef | |
1917 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1918 | char_t; | |
1919 | ||
1920 | rule_t regex, re, series, singleton, singleton2, fullccl, ccl, string, | |
1921 | escseq, ccl_char; | |
1922 | symbols<> ccl_expr; | |
1923 | ||
1924 | definition(lexer_grammar const &self) | |
1925 | { | |
1926 | regex = | |
1927 | re >> !('/' >> re) >> !ch_p('$') | |
1928 | ; | |
1929 | ||
1930 | re = | |
1931 | series | |
1932 | >>*( ('|' >> series)[make_or<ScannerT>(self.node_stack)] ) | |
1933 | ; | |
1934 | ||
1935 | series = | |
1936 | singleton | |
1937 | >>*( singleton[make_concat<ScannerT>(self.node_stack)] ) | |
1938 | ; | |
1939 | ||
1940 | singleton = | |
1941 | ch_p('.')[make_any_char<ScannerT>(self.node_stack)] | |
1942 | >> singleton2 | |
1943 | | fullccl | |
1944 | >> singleton2 | |
1945 | | ('"' >> string >> '"') | |
1946 | [ | |
1947 | make_string<ScannerT>(self.node_stack) | |
1948 | ] | |
1949 | >> singleton2 | |
1950 | | '(' >> re >> ')' | |
1951 | >> singleton2 | |
1952 | | ((anychar_p - chset<>("/|*+?.(){}\\")) | escseq) | |
1953 | [ | |
1954 | make_char<ScannerT>(self.node_stack) | |
1955 | ] | |
1956 | >> singleton2 | |
1957 | ; | |
1958 | ||
1959 | singleton2 = | |
1960 | ch_p('*')[make_star<ScannerT>(self.node_stack)] | |
1961 | >> singleton2 | |
1962 | | ch_p('+')[make_plus<ScannerT>(self.node_stack)] | |
1963 | >> singleton2 | |
1964 | | ch_p('?')[make_optional<ScannerT>(self.node_stack)] | |
1965 | >> singleton2 | |
1966 | | ('{' >> uint_p >> '}') | |
1967 | [ | |
1968 | make_rep1<ScannerT>(self.node_stack) | |
1969 | ] | |
1970 | >> singleton2 | |
1971 | | ('{' >> uint_p >> ',' >> '}') | |
1972 | [ | |
1973 | make_rep2<ScannerT>(self.node_stack) | |
1974 | ] | |
1975 | >> singleton2 | |
1976 | | ('{' >> uint_p >> ',' >> uint_p >> '}') | |
1977 | [ | |
1978 | make_rep3<ScannerT>(self.node_stack) | |
1979 | ] | |
1980 | >> singleton2 | |
1981 | | epsilon_p | |
1982 | ; | |
1983 | ||
1984 | fullccl = | |
1985 | ('[' >> !ch_p('^') >> ccl >> ']') | |
1986 | [ | |
1987 | make_ccl<ScannerT>(self.node_stack) | |
1988 | ] | |
1989 | ; | |
1990 | ||
1991 | ccl = | |
1992 | *(ccl_expr | (ccl_char >> !('-' >> ccl_char))) | |
1993 | ; | |
1994 | ||
1995 | ccl_char = | |
1996 | ( (anychar_p - chset<>("\\\n]")) | escseq ) | |
1997 | ; | |
1998 | ||
1999 | ccl_expr = | |
2000 | "[:alnum:]", | |
2001 | "[:alpha:]", | |
2002 | "[:blank:]", | |
2003 | "[:cntrl:]", | |
2004 | "[:digit:]", | |
2005 | "[:graph:]", | |
2006 | "[:lower:]", | |
2007 | "[:print:]", | |
2008 | "[:punct:]", | |
2009 | "[:space:]", | |
2010 | "[:upper:]", | |
2011 | "[:xdigit:]" | |
2012 | ; | |
2013 | ||
2014 | string = | |
2015 | +( (anychar_p - chset<>("\"\\")) | escseq ) | |
2016 | ; | |
2017 | ||
2018 | typedef | |
2019 | uint_parser<char_t, 8, 1, | |
2020 | std::numeric_limits<char_t>::digits / 3 + 1 | |
2021 | > oct_parser_t; | |
2022 | typedef | |
2023 | uint_parser<char_t, 16, 1, | |
2024 | std::numeric_limits<char_t>::digits / 4 + 1 | |
2025 | > hex_parser_t; | |
2026 | ||
2027 | escseq = | |
2028 | ch_p('\\') | |
2029 | >> ( | |
2030 | oct_parser_t() | |
2031 | | as_lower_d['x'] >> hex_parser_t() | |
2032 | | (anychar_p - chset<>('\n')) | |
2033 | ) | |
2034 | ; | |
2035 | ||
2036 | #define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2037 | ||
2038 | BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG); | |
2039 | BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG); | |
2040 | BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG); | |
2041 | BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG); | |
2042 | BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG); | |
2043 | BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG); | |
2044 | BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG); | |
2045 | BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG); | |
2046 | BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG); | |
2047 | BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG); | |
2048 | ||
2049 | #undef BOOST_SLEX_DEBUG | |
2050 | } | |
2051 | ||
2052 | rule<ScannerT> const& | |
2053 | start() const { return regex; } | |
2054 | }; | |
2055 | ||
2056 | std::stack<node*> &node_stack; | |
2057 | ||
2058 | }; // class lexer_grammar | |
2059 | ||
2060 | template <typename StringT> | |
2061 | inline node * | |
2062 | parse(lexer_grammar& g, StringT const& str) | |
2063 | { | |
2064 | typedef | |
2065 | scanner<typename StringT::const_iterator, scanner_policies<> > | |
2066 | scanner_t; | |
2067 | typedef typename rule<scanner_t>::template result<scanner_t>::type | |
2068 | result_t; | |
2069 | ||
2070 | typename StringT::const_iterator first = str.begin(); | |
2071 | typename StringT::const_iterator last = str.end(); | |
2072 | ||
2073 | scanner_t scan(first, last); | |
2074 | // typename rule<scanner_t>::result_t hit = g.parse(scan); | |
2075 | result_t hit = g.parse(scan); | |
2076 | if (!hit || !scan.at_end()) | |
2077 | { | |
2078 | while (g.node_stack.size()) | |
2079 | { | |
2080 | delete g.node_stack.top(); | |
2081 | g.node_stack.pop(); | |
2082 | } | |
2083 | return 0; | |
2084 | } | |
2085 | ||
2086 | BOOST_ASSERT(g.node_stack.size() == 1); | |
2087 | node* rval = g.node_stack.top(); | |
2088 | g.node_stack.pop(); | |
2089 | node* an_eof_node = new eof_node(); | |
2090 | rval = new cat_node(rval, an_eof_node); | |
2091 | return rval; | |
2092 | } | |
2093 | ||
2094 | inline | |
2095 | void make_case_insensitive(state_match_t& state_match) | |
2096 | { | |
2097 | // TODO: Fix this. | |
2098 | // This doesn't take into account foreign languages, figure out how to | |
2099 | // do that. Also this approach is broken for this implementation of | |
2100 | // wide chars. | |
2101 | for (state_match_t::iterator iter = state_match.begin(); | |
2102 | iter != state_match.end(); ++iter) | |
2103 | { | |
2104 | int i, j; | |
2105 | for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) | |
2106 | { | |
2107 | // if either is set, turn them both on | |
2108 | (*iter)[i] = (*iter)[j] = uchar((*iter)[i] | (*iter)[j]); | |
2109 | } | |
2110 | } | |
2111 | } | |
2112 | ||
2113 | template<bool wide_char> | |
2114 | struct regex_match_helper; | |
2115 | ||
2116 | template<> | |
2117 | struct regex_match_helper<false> // single byte char | |
2118 | { | |
2119 | template <typename DfaT, typename IteratorT> | |
2120 | static bool | |
2121 | do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, | |
2122 | int& regex_index, | |
2123 | std::basic_string< | |
2124 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2125 | > *token) | |
2126 | { | |
2127 | typedef std::basic_string< | |
2128 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2129 | > string_type; | |
2130 | typedef typename string_type::size_type size_type; | |
2131 | ||
2132 | node_id_t s = 0; | |
2133 | node_id_t last_accepting_index = invalid_node; | |
2134 | IteratorT p = first; | |
2135 | IteratorT last_accepting_cpos = first; | |
2136 | while (p != last) | |
2137 | { | |
2138 | s = dfa.transition_table[s][(uchar)*p]; | |
2139 | if (s == invalid_node) | |
2140 | break; | |
2141 | if (token) token->append((size_type)1, *p); | |
2142 | ++p; | |
2143 | if (dfa.acceptance_index[s] != invalid_node) | |
2144 | { | |
2145 | last_accepting_index = s; | |
2146 | last_accepting_cpos = p; | |
2147 | } | |
2148 | } | |
2149 | if (last_accepting_index != invalid_node) | |
2150 | { | |
2151 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2152 | std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << | |
2153 | dfa.acceptance_index[last_accepting_index] << '\n'; | |
2154 | #endif | |
2155 | ||
2156 | first = last_accepting_cpos; | |
2157 | regex_index = dfa.acceptance_index[last_accepting_index]; | |
2158 | return true; | |
2159 | } | |
2160 | else | |
2161 | return false; | |
2162 | } | |
2163 | }; | |
2164 | ||
2165 | template<> | |
2166 | struct regex_match_helper<true> // wide char | |
2167 | { | |
2168 | template <typename DfaT, typename IteratorT> | |
2169 | static bool | |
2170 | do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, | |
2171 | int& regex_index, | |
2172 | std::basic_string< | |
2173 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2174 | > *token) | |
2175 | { | |
2176 | typedef | |
2177 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2178 | char_t; | |
2179 | typedef std::basic_string<char_t> string_type; | |
2180 | typedef typename string_type::size_type size_type; | |
2181 | ||
2182 | node_id_t s = 0; | |
2183 | node_id_t last_accepting_index = invalid_node; | |
2184 | IteratorT wp = first; | |
2185 | IteratorT last_accepting_cpos = first; | |
2186 | ||
2187 | while (wp != last) | |
2188 | { | |
2189 | for (unsigned int i = 0; i < sizeof(char_t); ++i) | |
2190 | { | |
2191 | s = dfa.transition_table[s][get_byte(*wp,i)]; | |
2192 | if (s == invalid_node) | |
2193 | { | |
2194 | goto break_while; | |
2195 | } | |
2196 | } | |
2197 | if (token) token->append((size_type)1, *wp); | |
2198 | ++wp; | |
2199 | if (dfa.acceptance_index[s] != invalid_node) | |
2200 | { | |
2201 | last_accepting_index = s; | |
2202 | last_accepting_cpos = wp; | |
2203 | } | |
2204 | ||
2205 | } | |
2206 | ||
2207 | break_while: | |
2208 | if (last_accepting_index != invalid_node) | |
2209 | { | |
2210 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2211 | std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << | |
2212 | dfa.acceptance_index[last_accepting_index] << '\n'; | |
2213 | #endif | |
2214 | first = last_accepting_cpos; | |
2215 | regex_index = dfa.acceptance_index[last_accepting_index]; | |
2216 | ||
2217 | return true; | |
2218 | } | |
2219 | else | |
2220 | return false; | |
2221 | } | |
2222 | }; | |
2223 | ||
2224 | template <typename DfaT, typename IteratorT, bool wide_char> | |
2225 | struct regex_match | |
2226 | { | |
2227 | static bool | |
2228 | do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, | |
2229 | int& regex_index, | |
2230 | std::basic_string< | |
2231 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2232 | > *token) | |
2233 | { | |
2234 | return regex_match_helper<wide_char>::do_match( | |
2235 | dfa, first, last, regex_index, token); | |
2236 | } | |
2237 | }; | |
2238 | ||
2239 | } // namespace lexerimpl | |
2240 | ||
2241 | /////////////////////////////////////////////////////////////////////////////// | |
2242 | // | |
2243 | template <typename IteratorT = char const*, typename TokenT = int, | |
2244 | typename CallbackT = void(*)(IteratorT const &, | |
2245 | IteratorT &, | |
2246 | IteratorT const&, | |
2247 | TokenT const&, | |
2248 | lexer_control<TokenT> &)> | |
2249 | class lexer | |
2250 | { | |
2251 | public: | |
2252 | typedef CallbackT callback_t; | |
2253 | typedef | |
2254 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2255 | char_t; | |
2256 | ||
2257 | struct regex_info | |
2258 | { | |
2259 | std::basic_string<char_t> str; | |
2260 | TokenT token; | |
2261 | CallbackT callback; | |
2262 | ||
2263 | regex_info(const std::basic_string<char_t>& _str, | |
2264 | const TokenT& _token, | |
2265 | const CallbackT& _callback) | |
2266 | : str(_str) | |
2267 | , token(_token) | |
2268 | , callback(_callback) | |
2269 | {} | |
2270 | ||
2271 | }; | |
2272 | ||
2273 | struct dfa_table | |
2274 | { | |
2275 | std::vector<std::vector<node_id_t> > transition_table; | |
2276 | std::vector<node_id_t> acceptance_index; | |
2277 | }; | |
2278 | typedef std::vector<node_id_t> node_table_t; | |
2279 | typedef std::vector<node_table_t> transition_table_t; | |
2280 | typedef std::vector<dfa_table> dfa_t; | |
2281 | ||
2282 | ||
2283 | lexer(unsigned int states = 1); | |
2284 | ||
2285 | void register_regex(const std::basic_string<char_t>& regex, | |
2286 | const TokenT& id, const CallbackT& cb = CallbackT(), | |
2287 | unsigned int state = 0); | |
2288 | ||
2289 | TokenT next_token(IteratorT &first, IteratorT const &last, | |
2290 | std::basic_string<char_t> *token = 0); | |
2291 | ||
2292 | void create_dfa(); | |
2293 | bool has_compiled_dfa() { return m_compiled_dfa; } | |
2294 | ||
2295 | void set_case_insensitive(bool insensitive); | |
2296 | ||
2297 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2298 | void dump(std::ostream& out); | |
2299 | #endif | |
2300 | typedef std::vector<std::vector<regex_info> > regex_list_t; | |
2301 | ||
2302 | bool load (std::ifstream &in, long unique_id = 0); | |
2303 | bool save (std::ofstream &out, long unique_id = 0); | |
2304 | enum { | |
2305 | SLEX_SIGNATURE = 0x58454C53, // "SLEX" | |
2306 | SLEX_VERSION_100 = 0x0100, // persistance version | |
2307 | SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100, | |
2308 | SLEX_MINOR_VERSION_MASK = 0xFF | |
2309 | }; | |
2310 | ||
2311 | private: | |
2312 | ||
2313 | void create_dfa_for_state(int state); | |
2314 | ||
2315 | static bool regex_match(const dfa_t& dfa, IteratorT& first, | |
2316 | IteratorT& last, int& regex_index); | |
2317 | ||
2318 | mutable std::stack<lexerimpl::node*> node_stack; | |
2319 | lexerimpl::lexer_grammar g; | |
2320 | ||
2321 | mutable bool m_compiled_dfa; | |
2322 | mutable dfa_t m_dfa; | |
2323 | ||
2324 | regex_list_t m_regex_list; | |
2325 | bool m_case_insensitive; | |
2326 | ||
2327 | unsigned int m_state; | |
2328 | std::stack<unsigned int> m_state_stack; | |
2329 | unsigned int m_num_states; | |
2330 | }; | |
2331 | ||
2332 | ||
2333 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2334 | inline | |
2335 | lexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states) | |
2336 | : g(node_stack) | |
2337 | , m_compiled_dfa(false) | |
2338 | , m_regex_list(states) | |
2339 | , m_case_insensitive(false) | |
2340 | , m_state(0) | |
2341 | , m_state_stack() | |
2342 | , m_num_states(states) | |
2343 | { | |
2344 | BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer", | |
2345 | BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX); | |
2346 | } | |
2347 | ||
2348 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2349 | inline void | |
2350 | lexer<IteratorT, TokenT, CallbackT>::register_regex( | |
2351 | const std::basic_string<char_t>& regex, const TokenT& id, | |
2352 | const CallbackT& callback, unsigned int state) | |
2353 | { | |
2354 | if (state > m_num_states) { | |
2355 | m_regex_list.resize(state); | |
2356 | m_num_states = state; | |
2357 | } | |
2358 | m_regex_list[state].push_back(regex_info(regex, id, callback)); | |
2359 | } | |
2360 | ||
2361 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2362 | inline TokenT | |
2363 | lexer<IteratorT, TokenT, CallbackT>::next_token( | |
2364 | IteratorT &first, IteratorT const& last, | |
2365 | std::basic_string< | |
2366 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2367 | > *token) | |
2368 | { | |
2369 | if (!m_compiled_dfa) | |
2370 | { | |
2371 | create_dfa(); | |
2372 | } | |
2373 | ||
2374 | IteratorT saved = first; | |
2375 | int regex_index; | |
2376 | if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>:: | |
2377 | do_match(m_dfa[m_state], first, last, regex_index, token)) | |
2378 | return -1; // TODO: can't return -1, need to return some invalid token. | |
2379 | // how to figure this out? We can use traits I guess. | |
2380 | else | |
2381 | { | |
2382 | regex_info regex = m_regex_list[m_state][regex_index]; | |
2383 | TokenT rval = regex.token; | |
2384 | if (regex.callback) | |
2385 | { | |
2386 | // execute corresponding callback | |
2387 | lexer_control<TokenT> controller(rval, m_state, m_state_stack); | |
2388 | regex.callback(saved, first, last, regex.token, controller); | |
2389 | if (controller.ignore_current_token_set()) { | |
2390 | if (token) | |
2391 | token->erase(); | |
2392 | return next_token(first, last, token); | |
2393 | } | |
2394 | } | |
2395 | return rval; | |
2396 | } | |
2397 | } | |
2398 | ||
2399 | namespace lexerimpl | |
2400 | { | |
2401 | ||
2402 | inline | |
2403 | bool find_acceptance_state(const node_set& eof_node_ids, | |
2404 | const node_set& current_set, | |
2405 | node_id_t& acceptance_node_id) | |
2406 | { | |
2407 | for(node_set::const_iterator nsi = eof_node_ids.begin(); | |
2408 | nsi != eof_node_ids.end(); ++nsi) | |
2409 | { | |
2410 | node_id_t eof_node_id =*nsi; | |
2411 | if (current_set.end() != current_set.find(eof_node_id)) | |
2412 | { | |
2413 | // store the first one we come to as the | |
2414 | // matched pattern | |
2415 | acceptance_node_id = eof_node_id; | |
2416 | // don't bother searching for more | |
2417 | return true; | |
2418 | } | |
2419 | } | |
2420 | return false; | |
2421 | } | |
2422 | ||
2423 | template <typename RegexListT, typename GrammarT> | |
92f5a8d4 | 2424 | #ifndef BOOST_NO_CXX11_SMART_PTR |
b32b8144 FG |
2425 | inline std::unique_ptr<node> |
2426 | #else | |
7c673cae | 2427 | inline std::auto_ptr<node> |
b32b8144 | 2428 | #endif |
7c673cae FG |
2429 | parse_regexes(const RegexListT& regex_list, GrammarT& g) |
2430 | { | |
2431 | // parse the expressions into a tree | |
2432 | if (regex_list.begin() == regex_list.end()) | |
2433 | boost::throw_exception(bad_regex()); | |
2434 | ||
2435 | typename RegexListT::const_iterator ri = regex_list.begin(); | |
92f5a8d4 | 2436 | #ifndef BOOST_NO_CXX11_SMART_PTR |
b32b8144 FG |
2437 | std::unique_ptr<node> tree(lexerimpl::parse(g, (*ri).str)); |
2438 | #else | |
7c673cae | 2439 | std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str)); |
b32b8144 | 2440 | #endif |
7c673cae FG |
2441 | if (tree.get() == 0) |
2442 | boost::throw_exception(bad_regex()); | |
2443 | ||
2444 | ++ri; | |
2445 | for (/**/; ri != regex_list.end(); ++ri) | |
2446 | { | |
92f5a8d4 | 2447 | #ifndef BOOST_NO_CXX11_SMART_PTR |
b32b8144 FG |
2448 | std::unique_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str)); |
2449 | #else | |
7c673cae | 2450 | std::auto_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str)); |
b32b8144 | 2451 | #endif |
7c673cae FG |
2452 | if (next_tree.get() == 0) |
2453 | boost::throw_exception(bad_regex()); | |
92f5a8d4 | 2454 | #ifndef BOOST_NO_CXX11_SMART_PTR |
b32b8144 FG |
2455 | tree = std::unique_ptr<node>(new or_node(tree.release(), next_tree.release())); |
2456 | #else | |
2457 | tree = std::auto_ptr<node>(new or_node(tree.release(), next_tree.release())); | |
2458 | #endif | |
7c673cae FG |
2459 | } |
2460 | return tree; | |
2461 | } | |
2462 | ||
2463 | } //namespace lexerimpl | |
2464 | ||
2465 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2466 | inline void | |
2467 | lexer<IteratorT, TokenT, CallbackT>::create_dfa() | |
2468 | { | |
2469 | m_dfa.resize(m_num_states); | |
2470 | for (unsigned int i = 0; i < m_num_states; ++i) | |
2471 | create_dfa_for_state(i); | |
2472 | } | |
2473 | ||
2474 | // Algorithm from Compilers: Principles, Techniques, and Tools p. 141 | |
2475 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2476 | inline void | |
2477 | lexer<IteratorT, TokenT, CallbackT>::create_dfa_for_state(int state) | |
2478 | { | |
2479 | using lexerimpl::node; | |
92f5a8d4 | 2480 | #ifndef BOOST_NO_CXX11_SMART_PTR |
b32b8144 FG |
2481 | std::unique_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g); |
2482 | #else | |
7c673cae | 2483 | std::auto_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g); |
b32b8144 | 2484 | #endif |
7c673cae FG |
2485 | node_id_t dummy = 0; |
2486 | tree->assign_node_ids(dummy); | |
2487 | ||
2488 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2489 | tree->dump(std::cout); | |
2490 | #endif | |
2491 | ||
2492 | // compute followpos(root) | |
2493 | followpos_t followpos; | |
2494 | tree->compute_followpos(followpos); | |
2495 | ||
2496 | // the dfa states <-> nfa state groups | |
2497 | std::map<node_set, node_id_t> dstates1; | |
2498 | std::map<node_id_t, node_set> dstates2; | |
2499 | ||
2500 | // the dfa transitions | |
2501 | m_dfa[state].transition_table.push_back( | |
2502 | std::vector<node_id_t>(256, invalid_node)); | |
2503 | m_dfa[state].acceptance_index.push_back(invalid_node); | |
2504 | ||
2505 | // whether the dfa state has been processed yet | |
2506 | std::vector<node_id_t> marked; | |
2507 | ||
2508 | // used to give a unique id to each dfa state | |
2509 | node_id_t num_states = 0; | |
2510 | ||
2511 | // initially, the only unmarked state in Dstates is firstpos(root). | |
2512 | marked.push_back(0); | |
2513 | node_set fpr = tree->firstpos(); | |
2514 | dstates1[fpr] = 0; | |
2515 | dstates2[0] = fpr; | |
2516 | state_match_t state_match; | |
2517 | tree->compute_state_match(state_match); | |
2518 | ||
2519 | if (m_case_insensitive) | |
2520 | lexerimpl::make_case_insensitive(state_match); | |
2521 | ||
2522 | node_set eof_node_ids; | |
2523 | tree->get_eof_ids(eof_node_ids); | |
2524 | // translate the eof_node_ids into a 0-based index | |
2525 | std::map<node_id_t, node_id_t> eof_node_id_map; | |
2526 | unsigned int x = 0; | |
2527 | for (node_set::iterator node_id_it = eof_node_ids.begin(); | |
2528 | node_id_it != eof_node_ids.end(); | |
2529 | ++node_id_it) | |
2530 | { | |
2531 | eof_node_id_map[*node_id_it] = x++; | |
2532 | } | |
2533 | ||
2534 | // figure out if this is an acceptance state | |
2535 | node_id_t eof_node_id; | |
2536 | if (lexerimpl::find_acceptance_state(eof_node_ids, fpr, eof_node_id)) | |
2537 | { | |
2538 | m_dfa[state].acceptance_index[0] = eof_node_id_map[eof_node_id]; | |
2539 | } | |
2540 | ||
2541 | std::vector<node_id_t>::iterator i = std::find(marked.begin(), marked.end(), | |
2542 | node_id_t(0)); | |
2543 | while (marked.end() != i) | |
2544 | { | |
2545 | *i = 1; | |
2546 | node_id_t T = node_id_t(std::distance(marked.begin(), i)); | |
2547 | BOOST_ASSERT(T < dstates2.size()); | |
2548 | node_set Tstates = dstates2[T]; | |
2549 | for (node_id_t j = 0; j < 256; ++j) | |
2550 | { | |
2551 | node_set U; | |
2552 | for (node_set::iterator k = Tstates.begin(); | |
2553 | k != Tstates.end(); ++k) | |
2554 | { | |
2555 | node_id_t p =*k; | |
2556 | BOOST_ASSERT(p < state_match.size()); | |
2557 | BOOST_ASSERT(j < state_match[p].size()); | |
2558 | if (state_match[p][j]) | |
2559 | { | |
2560 | node_set fpp = followpos[p]; | |
2561 | U.insert(fpp.begin(), fpp.end()); | |
2562 | } | |
2563 | } | |
2564 | if (U.size() > 0) | |
2565 | { | |
2566 | std::map<node_set, node_id_t>::iterator l = dstates1.find(U); | |
2567 | node_id_t target_state; | |
2568 | if (l == dstates1.end()) // not in the states yet | |
2569 | { | |
2570 | ++num_states; | |
2571 | dstates1[U] = target_state = num_states; | |
2572 | dstates2[target_state] = U; | |
2573 | marked.push_back(0); | |
2574 | m_dfa[state].transition_table.push_back( | |
2575 | std::vector<node_id_t>(256, invalid_node)); | |
2576 | m_dfa[state].acceptance_index.push_back(invalid_node); | |
2577 | // figure out if this is an acceptance state | |
2578 | node_id_t eof_node_id; | |
2579 | if (lexerimpl::find_acceptance_state(eof_node_ids, U, eof_node_id)) | |
2580 | { | |
2581 | m_dfa[state].acceptance_index[target_state] = | |
2582 | eof_node_id_map[eof_node_id]; | |
2583 | } | |
2584 | } | |
2585 | else | |
2586 | { | |
2587 | target_state = dstates1[U]; | |
2588 | } | |
2589 | ||
2590 | BOOST_ASSERT(T < m_dfa[state].transition_table.size()); | |
2591 | BOOST_ASSERT(j < m_dfa[state].transition_table[T].size()); | |
2592 | m_dfa[state].transition_table[T][j] = target_state; | |
2593 | } | |
2594 | ||
2595 | } | |
2596 | ||
2597 | i = std::find(marked.begin(), marked.end(), node_id_t(0)); | |
2598 | } | |
2599 | m_compiled_dfa = true; | |
2600 | } | |
2601 | ||
2602 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2603 | inline void | |
2604 | lexer<IteratorT, TokenT, CallbackT>::set_case_insensitive(bool insensitive) | |
2605 | { | |
2606 | m_case_insensitive = insensitive; | |
2607 | } | |
2608 | ||
2609 | ||
2610 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2611 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2612 | inline void | |
2613 | lexer<IteratorT, TokenT, CallbackT>::dump(std::ostream& out) | |
2614 | { | |
2615 | for (unsigned x = 0; x < m_dfa.size(); ++x) | |
2616 | { | |
2617 | out << "\nm_dfa[" << x << "] has " << m_dfa[x].transition_table.size() << " states\n"; | |
2618 | for (node_id_t i = 0; i < m_dfa[x].transition_table.size(); ++i) | |
2619 | { | |
2620 | out << "state " << i << ":"; | |
2621 | for (node_id_t j = 0; j < m_dfa[x].transition_table[i].size(); ++j) | |
2622 | { | |
2623 | if (m_dfa[x].transition_table[i][j] != invalid_node) | |
2624 | out << j << "->" << m_dfa[x].transition_table[i][j] << " "; | |
2625 | } | |
2626 | out << "\n"; | |
2627 | } | |
2628 | out << "acceptance states: "; | |
2629 | for(unsigned int k = 0; k < m_dfa[x].acceptance_index.size(); ++k) | |
2630 | { | |
2631 | if (m_dfa[x].acceptance_index[k] != invalid_node) | |
2632 | out << '<' << k << ',' << m_dfa[x].acceptance_index[k] << "> "; | |
2633 | } | |
2634 | out << endl; | |
2635 | } | |
2636 | } | |
2637 | #endif | |
2638 | ||
2639 | /////////////////////////////////////////////////////////////////////////////// | |
2640 | // load the lexer tables | |
2641 | #define slex_in(strm, val) \ | |
2642 | strm.read((char*)&val, sizeof(val)); \ | |
2643 | if (std::ios::goodbit != strm.rdstate()) return false | |
2644 | ||
2645 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2646 | inline bool | |
2647 | lexer<IteratorT, TokenT, CallbackT>::load (std::ifstream &in, long unique_id) | |
2648 | { | |
2649 | // ensure correct signature and version | |
2650 | long signature = 0; | |
2651 | ||
2652 | slex_in (in, signature); | |
2653 | if (signature != SLEX_SIGNATURE) | |
2654 | return false; // not for us | |
2655 | ||
2656 | long version = 0; | |
2657 | ||
2658 | slex_in (in, version); | |
2659 | if ((version & ~SLEX_MINOR_VERSION_MASK) > SLEX_LAST_KNOWN_VERSION) | |
2660 | return false; // to new for us | |
2661 | ||
2662 | long uid = 0; | |
2663 | ||
2664 | slex_in (in, uid); | |
2665 | if (uid != unique_id) | |
2666 | return false; // not saved by us | |
2667 | ||
2668 | // load auxiliary members | |
2669 | int num_states = 0; | |
2670 | ||
2671 | slex_in (in, num_states); | |
2672 | ||
2673 | // load the dfa tables | |
2674 | dfa_t in_dfa; | |
2675 | std::size_t dfa_size = 0; | |
2676 | ||
2677 | slex_in (in, dfa_size); | |
2678 | in_dfa.resize(dfa_size); | |
2679 | for (std::size_t dfa = 0; dfa < dfa_size; ++dfa) | |
2680 | { | |
2681 | // load the transition tables | |
2682 | std::size_t tt_size = 0; | |
2683 | transition_table_t &tt_table = in_dfa[dfa].transition_table; | |
2684 | ||
2685 | slex_in (in, tt_size); | |
2686 | tt_table.resize(tt_size); | |
2687 | for (std::size_t tt = 0; tt < tt_size; ++tt) | |
2688 | { | |
2689 | std::size_t nt_size = 0; | |
2690 | node_table_t &nt_table = tt_table[tt]; | |
2691 | ||
2692 | slex_in (in, nt_size); | |
2693 | nt_table.resize(nt_size); | |
2694 | for (std::size_t nt = 0; nt < nt_size; ++nt) | |
2695 | { | |
2696 | slex_in (in, nt_table[nt]); | |
2697 | } | |
2698 | } | |
2699 | ||
2700 | // load the acceptance index table | |
2701 | std::size_t ai_size = 0; | |
2702 | node_table_t &ai_table = in_dfa[dfa].acceptance_index; | |
2703 | ||
2704 | slex_in (in, ai_size); | |
2705 | ai_table.resize(ai_size); | |
2706 | for (std::size_t ai = 0; ai < ai_size; ++ai) | |
2707 | { | |
2708 | slex_in (in, ai_table[ai]); | |
2709 | } | |
2710 | } | |
2711 | ||
2712 | m_dfa.swap(in_dfa); // success, swap in the read values | |
2713 | m_num_states = num_states; | |
2714 | ||
2715 | m_compiled_dfa = true; | |
2716 | return true; | |
2717 | } | |
2718 | ||
2719 | #undef slex_in | |
2720 | ||
2721 | /////////////////////////////////////////////////////////////////////////////// | |
2722 | // save the lexer tables | |
2723 | #define slex_out(strm, val) \ | |
2724 | strm.write((char*)&val, sizeof(val)); \ | |
2725 | if (std::ios::goodbit != strm.rdstate()) return false | |
2726 | ||
2727 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2728 | inline bool | |
2729 | lexer<IteratorT, TokenT, CallbackT>::save (std::ofstream &out, long unique_id) | |
2730 | { | |
2731 | // save signature and version information | |
2732 | long out_long = SLEX_SIGNATURE; | |
2733 | ||
2734 | slex_out(out, out_long); | |
2735 | out_long = SLEX_VERSION_100; | |
2736 | slex_out(out, out_long); | |
2737 | slex_out(out, unique_id); | |
2738 | ||
2739 | // save auxiliary members | |
2740 | slex_out(out, m_num_states); | |
2741 | ||
2742 | // save the dfa tables | |
2743 | typedef typename dfa_t::const_iterator dfa_iter_t; | |
2744 | typedef transition_table_t::const_iterator transition_table_iter_t; | |
2745 | typedef node_table_t::const_iterator node_table_iter_t; | |
2746 | ||
2747 | std::size_t out_size_t = m_dfa.size(); | |
2748 | slex_out(out, out_size_t); | |
2749 | ||
2750 | dfa_iter_t end = m_dfa.end(); | |
2751 | for (dfa_iter_t it = m_dfa.begin(); it != end; ++it) | |
2752 | { | |
2753 | // save the transition table | |
2754 | out_size_t = (*it).transition_table.size(); | |
2755 | slex_out(out, out_size_t); | |
2756 | ||
2757 | transition_table_iter_t tt_end = (*it).transition_table.end(); | |
2758 | for (transition_table_iter_t tt_it = (*it).transition_table.begin(); | |
2759 | tt_it != tt_end; | |
2760 | ++tt_it) | |
2761 | { | |
2762 | out_size_t = (*tt_it).size(); | |
2763 | slex_out(out, out_size_t); | |
2764 | ||
2765 | node_table_iter_t nt_end = (*tt_it).end(); | |
2766 | for (node_table_iter_t nt_it = (*tt_it).begin(); | |
2767 | nt_it != nt_end; | |
2768 | ++nt_it) | |
2769 | { | |
2770 | slex_out(out, (*nt_it)); | |
2771 | } | |
2772 | } | |
2773 | ||
2774 | // save the acceptance index table | |
2775 | out_size_t = (*it).acceptance_index.size(); | |
2776 | slex_out(out, out_size_t); | |
2777 | ||
2778 | node_table_iter_t nt_end = (*it).acceptance_index.end(); | |
2779 | for (node_table_iter_t nt_it = (*it).acceptance_index.begin(); | |
2780 | nt_it != nt_end; | |
2781 | ++nt_it) | |
2782 | { | |
2783 | slex_out(out, (*nt_it)); | |
2784 | } | |
2785 | } | |
2786 | return true; | |
2787 | } | |
2788 | ||
2789 | #undef slex_out | |
2790 | ||
2791 | /* | |
2792 | a lexer_control object supports some operations on the lexer. | |
2793 | get current lexer state | |
2794 | set state | |
2795 | terminate | |
2796 | state stack (push, pop, top) | |
2797 | set new token return value | |
2798 | ignore the current token | |
2799 | yymore | |
2800 | get length of matched token | |
2801 | */ | |
2802 | template <typename TokenT> | |
2803 | class lexer_control | |
2804 | { | |
2805 | public: | |
2806 | ||
2807 | lexer_control(TokenT& token, unsigned int& current_state, | |
2808 | std::stack<unsigned int>& state_stack); | |
2809 | // functions dealing with the lexer state | |
2810 | ||
2811 | // set the state to state | |
2812 | void set_state(unsigned int state); | |
2813 | ||
2814 | // get the current state | |
2815 | unsigned int get_state(); | |
2816 | ||
2817 | // pushes the current state onto the top of the state stack and | |
2818 | // switches to new_state | |
2819 | void push_state(unsigned int new_state); | |
2820 | ||
2821 | // pops the top of the state stack and switches to it. | |
2822 | void pop_state(); | |
2823 | ||
2824 | // returns the top of the stack without altering the stack's contents. | |
2825 | unsigned int top_state(); | |
2826 | ||
2827 | // functions dealing with the token returned. | |
2828 | ||
2829 | // set a new TokenT return value, overriding that one that was | |
2830 | // registered via. register_regex() | |
2831 | void set_token(const TokenT& token); | |
2832 | ||
2833 | // tell the lexer to return an invalid token, signifying termination. | |
2834 | void terminate(); | |
2835 | ||
2836 | // ignore the current token, and move on to the next one. The current | |
2837 | // token will NOT be returned from next_token() | |
2838 | void ignore_current_token(); | |
2839 | ||
2840 | // returns true if ignore_current_token() has been called, | |
2841 | // false otherwise. | |
2842 | bool ignore_current_token_set(); | |
2843 | ||
2844 | private: | |
2845 | TokenT& m_token; | |
2846 | bool m_ignore_current_token; | |
2847 | unsigned int& m_current_state; | |
2848 | std::stack<unsigned int>& m_state_stack; | |
2849 | }; | |
2850 | ||
2851 | template <typename TokenT> | |
2852 | inline | |
2853 | lexer_control<TokenT>::lexer_control(TokenT& token, unsigned int& current_state, | |
2854 | std::stack<unsigned int>& state_stack) | |
2855 | : m_token(token) | |
2856 | , m_ignore_current_token(false) | |
2857 | , m_current_state(current_state) | |
2858 | , m_state_stack(state_stack) | |
2859 | { | |
2860 | } | |
2861 | ||
2862 | template <typename TokenT> | |
2863 | inline void | |
2864 | lexer_control<TokenT>::set_state(unsigned int state) | |
2865 | { | |
2866 | m_current_state = state; | |
2867 | } | |
2868 | ||
2869 | template <typename TokenT> | |
2870 | inline unsigned int | |
2871 | lexer_control<TokenT>::get_state() | |
2872 | { | |
2873 | return m_current_state; | |
2874 | } | |
2875 | ||
2876 | template <typename TokenT> | |
2877 | inline void | |
2878 | lexer_control<TokenT>::push_state(unsigned int new_state) | |
2879 | { | |
2880 | m_state_stack.push(m_current_state); | |
2881 | m_current_state = new_state; | |
2882 | } | |
2883 | ||
2884 | template <typename TokenT> | |
2885 | inline void | |
2886 | lexer_control<TokenT>::pop_state() | |
2887 | { | |
2888 | m_current_state = m_state_stack.top(); | |
2889 | m_state_stack.pop(); | |
2890 | } | |
2891 | ||
2892 | template <typename TokenT> | |
2893 | inline unsigned int | |
2894 | lexer_control<TokenT>::top_state() | |
2895 | { | |
2896 | return m_state_stack.top(); | |
2897 | } | |
2898 | ||
2899 | template <typename TokenT> | |
2900 | inline void | |
2901 | lexer_control<TokenT>::set_token(const TokenT& token) | |
2902 | { | |
2903 | m_token = token; | |
2904 | } | |
2905 | ||
2906 | template <typename TokenT> | |
2907 | inline void | |
2908 | lexer_control<TokenT>::terminate() | |
2909 | { | |
2910 | m_token = -1; // TOOD: fix this, need to be determined by traits | |
2911 | } | |
2912 | ||
2913 | template <typename TokenT> | |
2914 | inline void | |
2915 | lexer_control<TokenT>::ignore_current_token() | |
2916 | { | |
2917 | m_ignore_current_token = true; | |
2918 | } | |
2919 | ||
2920 | template <typename TokenT> | |
2921 | inline bool | |
2922 | lexer_control<TokenT>::ignore_current_token_set() | |
2923 | { | |
2924 | return m_ignore_current_token; | |
2925 | } | |
2926 | ||
2927 | } // namespace classic | |
2928 | } // namespace spirit | |
2929 | } // namespace boost | |
2930 | ||
2931 | #undef BOOST_SPIRIT_IT_NS | |
2932 | ||
2933 | #endif | |
2934 |