]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /*============================================================================= |
2 | Boost.Wave: A Standard compliant C++ preprocessor library | |
3 | ||
4 | Spirit based lexer | |
5 | ||
6 | http://www.boost.org/ | |
7 | ||
8 | Copyright (c) 2001, Daniel C. Nuffer. | |
9 | Copyright (c) 2001-2012 Hartmut Kaiser. | |
10 | Distributed under the Boost Software License, Version 1.0. (See accompanying | |
11 | file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
12 | ||
13 | TODO List: | |
14 | X callback objects (called when a match is made.) | |
15 | X callback passed first & last iterator, and | |
16 | a reference to a lexercontrol object that supports some | |
17 | operations on the lexer. | |
18 | set state | |
19 | terminate | |
20 | state stack (push, pop, top) | |
21 | set new token return value | |
22 | ignore the current token | |
23 | yymore | |
24 | get length of matched token | |
25 | get current lexer state | |
26 | X DFA serialization to save recomputing the DFA. | |
27 | ||
28 | lexer states. | |
29 | organize the file into hpp and ipp. arrange the classes in a logical order. | |
30 | use buffering - iterator only needs be an input iterator, | |
31 | lexer & callback are not templatized on iterator type, but char type. | |
32 | next_token is templatized on iterator type. | |
33 | beginning/ending contexts. | |
34 | ^ and $ | |
35 | DFA minimization. | |
36 | DFA table compression. | |
37 | ||
38 | =============================================================================*/ | |
39 | #ifndef BOOST_SPIRIT_LEXER_HPP | |
40 | #define BOOST_SPIRIT_LEXER_HPP | |
41 | ||
42 | /////////////////////////////////////////////////////////////////////////////// | |
43 | #include <boost/throw_exception.hpp> | |
44 | ||
45 | #include <boost/spirit/include/classic_core.hpp> | |
46 | #include <boost/spirit/include/classic_symbols.hpp> | |
47 | #include <boost/spirit/include/classic_chset.hpp> | |
48 | #include <boost/spirit/include/classic_escape_char.hpp> | |
49 | ||
50 | #include <set> | |
51 | #include <map> | |
52 | #include <vector> | |
53 | #include <stack> | |
54 | #include <utility> // for pair | |
55 | #include <iostream> | |
56 | #include <fstream> | |
57 | #include <boost/assert.hpp> | |
58 | #include <boost/limits.hpp> | |
59 | ||
60 | #if defined(BOOST_NO_STD_ITERATOR_TRAITS) | |
61 | #define BOOST_SPIRIT_IT_NS impl | |
62 | #else | |
63 | #define BOOST_SPIRIT_IT_NS std | |
64 | #endif | |
65 | ||
66 | /////////////////////////////////////////////////////////////////////////////// | |
67 | namespace boost { | |
68 | namespace spirit { | |
69 | namespace classic { | |
70 | ||
71 | typedef unsigned char uchar; | |
72 | typedef unsigned int node_id_t; | |
73 | const node_id_t invalid_node = node_id_t(-1); | |
74 | typedef std::set<node_id_t> node_set; | |
75 | typedef std::vector<uchar> uchar_vector; | |
76 | typedef std::map<node_id_t, node_set> followpos_t; | |
77 | typedef std::vector<uchar_vector> state_match_t; | |
78 | ||
79 | template <typename TokenT> | |
80 | class lexer_control; | |
81 | ||
82 | class bad_regex : public std::exception | |
83 | { | |
84 | }; | |
85 | ||
86 | namespace lexerimpl | |
87 | { | |
88 | ||
89 | class node | |
90 | { | |
91 | public: | |
92 | ||
93 | virtual ~node() {} | |
94 | ||
95 | virtual node* clone() const = 0; | |
96 | virtual bool nullable() const = 0; | |
97 | virtual node_set firstpos() const = 0; | |
98 | virtual node_set lastpos() const = 0; | |
99 | virtual void compute_followpos(followpos_t& followpos) const = 0; | |
100 | virtual void compute_state_match(state_match_t& state_match) const = 0; | |
101 | virtual void get_eof_ids(node_set& eof_set) const = 0; | |
102 | virtual void assign_node_ids(node_id_t& node_count) = 0; | |
103 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
104 | virtual void dump(std::ostream& out) const = 0; | |
105 | #endif | |
106 | ||
107 | }; | |
108 | ||
109 | class char_node : public node | |
110 | { | |
111 | ||
112 | public: | |
113 | ||
114 | char_node(const uchar c); | |
115 | char_node(const char_node& x); | |
116 | virtual ~char_node(){} | |
117 | ||
118 | virtual node* clone() const; | |
119 | virtual bool nullable() const; | |
120 | virtual node_set firstpos() const; | |
121 | virtual node_set lastpos() const; | |
122 | virtual void compute_followpos(followpos_t& followpos) const; | |
123 | virtual void compute_state_match(state_match_t& state_match ) const; | |
124 | virtual void get_eof_ids(node_set& eof_set) const; | |
125 | virtual void assign_node_ids(node_id_t& node_count); | |
126 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
127 | virtual void dump(std::ostream& out) const; | |
128 | #endif | |
129 | ||
130 | private: | |
131 | ||
132 | uchar m_char; | |
133 | node_id_t m_node_num; | |
134 | }; | |
135 | ||
136 | inline | |
137 | char_node::char_node(const uchar c) | |
138 | : node() | |
139 | , m_char(c) | |
140 | , m_node_num(0) | |
141 | { | |
142 | } | |
143 | ||
144 | inline | |
145 | char_node::char_node(const char_node& x) | |
146 | : node(x) | |
147 | , m_char(x.m_char) | |
148 | , m_node_num(x.m_node_num) | |
149 | { | |
150 | } | |
151 | ||
152 | inline node * | |
153 | char_node::clone() const | |
154 | { | |
155 | return new char_node(*this); | |
156 | } | |
157 | ||
158 | inline bool | |
159 | char_node::nullable() const | |
160 | { | |
161 | return false; | |
162 | } | |
163 | ||
164 | inline node_set | |
165 | char_node::firstpos() const | |
166 | { | |
167 | node_set rval; | |
168 | rval.insert(m_node_num); | |
169 | return rval; | |
170 | } | |
171 | ||
172 | inline node_set | |
173 | char_node::lastpos() const | |
174 | { | |
175 | return firstpos(); | |
176 | } | |
177 | ||
178 | inline void | |
179 | char_node::compute_followpos(followpos_t&) const | |
180 | { | |
181 | return; | |
182 | } | |
183 | ||
184 | inline void | |
185 | char_node::compute_state_match(state_match_t& state_match) const | |
186 | { | |
187 | if (state_match.size() < m_node_num + 1) | |
188 | state_match.resize(m_node_num + 1); | |
189 | state_match[m_node_num].resize(256); | |
190 | state_match[m_node_num][m_char] = 1; | |
191 | } | |
192 | ||
193 | inline void | |
194 | char_node::get_eof_ids(node_set&) const | |
195 | { | |
196 | return; | |
197 | } | |
198 | ||
199 | inline void | |
200 | char_node::assign_node_ids(node_id_t& node_count) | |
201 | { | |
202 | m_node_num = node_count++; | |
203 | } | |
204 | ||
205 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
206 | inline void | |
207 | char_node::dump(std::ostream& out) const | |
208 | { | |
209 | out << "\nchar_node m_char = " << m_char; | |
210 | out << " m_node_num = " << m_node_num; | |
211 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
212 | out << " firstpos() = "; | |
213 | node_set fp = firstpos(); | |
214 | std::copy(fp.begin(), fp.end(), | |
215 | std::ostream_iterator<node_id_t>(out, ",")); | |
216 | ||
217 | out << " lastpos() = "; | |
218 | node_set lp = lastpos(); | |
219 | std::copy(lp.begin(), lp.end(), | |
220 | std::ostream_iterator<node_id_t>(out, ",")); | |
221 | } | |
222 | #endif | |
223 | ||
224 | ||
225 | class epsilon_node : public node | |
226 | { | |
227 | ||
228 | public: | |
229 | ||
230 | epsilon_node(); | |
231 | epsilon_node(const epsilon_node& x); | |
232 | virtual ~epsilon_node(){} | |
233 | ||
234 | virtual node* clone() const; | |
235 | virtual bool nullable() const; | |
236 | virtual node_set firstpos() const; | |
237 | virtual node_set lastpos() const; | |
238 | virtual void compute_followpos(followpos_t& followpos) const; | |
239 | virtual void compute_state_match(state_match_t& state_match ) const; | |
240 | virtual void get_eof_ids(node_set& eof_set) const; | |
241 | virtual void assign_node_ids(node_id_t& node_count); | |
242 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
243 | virtual void dump(std::ostream& out) const; | |
244 | #endif | |
245 | ||
246 | private: | |
247 | ||
248 | node_id_t m_node_num; | |
249 | }; | |
250 | ||
251 | inline | |
252 | epsilon_node::epsilon_node() | |
253 | : node() | |
254 | , m_node_num(0) | |
255 | { | |
256 | } | |
257 | ||
258 | inline | |
259 | epsilon_node::epsilon_node(const epsilon_node& x) | |
260 | : node(x) | |
261 | , m_node_num(x.m_node_num) | |
262 | { | |
263 | } | |
264 | ||
265 | inline node * | |
266 | epsilon_node::clone() const | |
267 | { | |
268 | return new epsilon_node(*this); | |
269 | } | |
270 | ||
271 | inline bool | |
272 | epsilon_node::nullable() const | |
273 | { | |
274 | return true; | |
275 | } | |
276 | ||
277 | inline node_set | |
278 | epsilon_node::firstpos() const | |
279 | { | |
280 | return node_set(); | |
281 | } | |
282 | ||
283 | inline node_set | |
284 | epsilon_node::lastpos() const | |
285 | { | |
286 | return node_set(); | |
287 | } | |
288 | ||
289 | inline void | |
290 | epsilon_node::compute_followpos(followpos_t&) const | |
291 | { | |
292 | return; | |
293 | } | |
294 | ||
295 | inline void | |
296 | epsilon_node::compute_state_match(state_match_t& state_match) const | |
297 | { | |
298 | if (state_match.size() < m_node_num + 1) | |
299 | state_match.resize(m_node_num + 1); | |
300 | state_match[m_node_num].resize(256, 1); | |
301 | } | |
302 | ||
303 | inline void | |
304 | epsilon_node::get_eof_ids(node_set&) const | |
305 | { | |
306 | return; | |
307 | } | |
308 | ||
309 | inline void | |
310 | epsilon_node::assign_node_ids(node_id_t& node_count) | |
311 | { | |
312 | m_node_num = node_count++; | |
313 | } | |
314 | ||
315 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
316 | inline void | |
317 | epsilon_node::dump(std::ostream& out) const | |
318 | { | |
319 | out << "\nepsilon_node"; | |
320 | out << " m_node_num = " << m_node_num; | |
321 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
322 | out << " firstpos() = "; | |
323 | node_set fp = firstpos(); | |
324 | std::copy(fp.begin(), fp.end(), | |
325 | std::ostream_iterator<node_id_t>(out, ",")); | |
326 | ||
327 | out << " lastpos() = "; | |
328 | node_set lp = lastpos(); | |
329 | std::copy(lp.begin(), lp.end(), | |
330 | std::ostream_iterator<node_id_t>(out, ",")); | |
331 | } | |
332 | #endif | |
333 | ||
334 | ||
335 | class or_node : public node | |
336 | { | |
337 | ||
338 | public: | |
339 | ||
340 | or_node(node* left, node* right); | |
341 | or_node(const or_node& x); | |
342 | virtual ~or_node(){} | |
343 | ||
344 | virtual node* clone() const; | |
345 | virtual bool nullable() const; | |
346 | virtual node_set firstpos() const; | |
347 | virtual node_set lastpos() const; | |
348 | virtual void compute_followpos(followpos_t& followpos) const; | |
349 | virtual void compute_state_match(state_match_t& state_match ) const; | |
350 | virtual void get_eof_ids(node_set& eof_set) const; | |
351 | virtual void assign_node_ids(node_id_t& node_count); | |
352 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
353 | virtual void dump(std::ostream& out) const; | |
354 | #endif | |
355 | ||
356 | private: | |
357 | ||
b32b8144 FG |
358 | #ifdef BOOST_NO_AUTO_PTR |
359 | std::unique_ptr<node> m_left; | |
360 | std::unique_ptr<node> m_right; | |
361 | #else | |
7c673cae FG |
362 | std::auto_ptr<node> m_left; |
363 | std::auto_ptr<node> m_right; | |
b32b8144 | 364 | #endif |
7c673cae FG |
365 | }; |
366 | ||
367 | inline | |
368 | or_node::or_node(node* left, node* right) | |
369 | : node() | |
370 | , m_left(left) | |
371 | , m_right(right) | |
372 | { | |
373 | } | |
374 | ||
375 | inline | |
376 | or_node::or_node(const or_node& x) | |
377 | : node(x) | |
378 | , m_left(x.m_left->clone()) | |
379 | , m_right(x.m_right->clone()) | |
380 | { | |
381 | } | |
382 | ||
383 | inline node * | |
384 | or_node::clone() const | |
385 | { | |
386 | return new or_node(m_left->clone(), m_right->clone()); | |
387 | } | |
388 | ||
389 | inline bool | |
390 | or_node::nullable() const | |
391 | { | |
392 | return m_left->nullable() || m_right->nullable(); | |
393 | } | |
394 | ||
395 | inline node_set | |
396 | or_node::firstpos() const | |
397 | { | |
398 | node_set rval; | |
399 | node_set l = m_left->firstpos(); | |
400 | node_set r = m_right->firstpos(); | |
401 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
402 | std::inserter(rval, rval.begin())); | |
403 | return rval; | |
404 | } | |
405 | ||
406 | inline node_set | |
407 | or_node::lastpos() const | |
408 | { | |
409 | node_set rval; | |
410 | node_set l = m_left->lastpos(); | |
411 | node_set r = m_right->lastpos(); | |
412 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
413 | std::inserter(rval, rval.begin())); | |
414 | return rval; | |
415 | } | |
416 | ||
417 | inline void | |
418 | or_node::compute_followpos(followpos_t& followpos) const | |
419 | { | |
420 | m_left->compute_followpos(followpos); | |
421 | m_right->compute_followpos(followpos); | |
422 | } | |
423 | ||
424 | inline void | |
425 | or_node::compute_state_match(state_match_t& state_match) const | |
426 | { | |
427 | m_left->compute_state_match(state_match); | |
428 | m_right->compute_state_match(state_match); | |
429 | } | |
430 | ||
431 | inline void | |
432 | or_node::get_eof_ids(node_set& eof_nodes) const | |
433 | { | |
434 | m_left->get_eof_ids(eof_nodes); | |
435 | m_right->get_eof_ids(eof_nodes); | |
436 | } | |
437 | ||
438 | inline void | |
439 | or_node::assign_node_ids(node_id_t& node_count) | |
440 | { | |
441 | m_left->assign_node_ids(node_count); | |
442 | m_right->assign_node_ids(node_count); | |
443 | } | |
444 | ||
445 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
446 | inline void | |
447 | or_node::dump(std::ostream& out) const | |
448 | { | |
449 | m_left->dump(out); | |
450 | ||
451 | out << "\nor_node"; | |
452 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
453 | out << " firstpos() = "; | |
454 | node_set fp = firstpos(); | |
455 | std::copy(fp.begin(), fp.end(), | |
456 | std::ostream_iterator<node_id_t>(out, ",")); | |
457 | ||
458 | out << " lastpos() = "; | |
459 | node_set lp = lastpos(); | |
460 | std::copy(lp.begin(), lp.end(), | |
461 | std::ostream_iterator<node_id_t>(out, ",")); | |
462 | ||
463 | m_right->dump(out); | |
464 | } | |
465 | #endif | |
466 | ||
467 | ||
468 | class cat_node : public node | |
469 | { | |
470 | ||
471 | public: | |
472 | ||
473 | cat_node(node* left, node* right); | |
474 | cat_node(const cat_node& x); | |
475 | virtual ~cat_node(){} | |
476 | ||
477 | virtual node* clone() const; | |
478 | virtual bool nullable() const; | |
479 | virtual node_set firstpos() const; | |
480 | virtual node_set lastpos() const; | |
481 | virtual void compute_followpos(followpos_t& followpos) const; | |
482 | virtual void compute_state_match(state_match_t& state_match ) const; | |
483 | virtual void get_eof_ids(node_set& eof_set) const; | |
484 | virtual void assign_node_ids(node_id_t& node_count); | |
485 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
486 | virtual void dump(std::ostream& out) const; | |
487 | #endif | |
488 | ||
489 | private: | |
490 | ||
b32b8144 FG |
491 | #ifdef BOOST_NO_AUTO_PTR |
492 | std::unique_ptr<node> m_left; | |
493 | std::unique_ptr<node> m_right; | |
494 | #else | |
7c673cae FG |
495 | std::auto_ptr<node> m_left; |
496 | std::auto_ptr<node> m_right; | |
b32b8144 | 497 | #endif |
7c673cae FG |
498 | }; |
499 | ||
500 | inline | |
501 | cat_node::cat_node(node* left, node* right) | |
502 | : node() | |
503 | , m_left(left) | |
504 | , m_right(right) | |
505 | { | |
506 | } | |
507 | ||
508 | inline | |
509 | cat_node::cat_node(const cat_node& x) | |
510 | : node(x) | |
511 | , m_left(x.m_left->clone()) | |
512 | , m_right(x.m_right->clone()) | |
513 | { | |
514 | } | |
515 | ||
516 | inline node * | |
517 | cat_node::clone() const | |
518 | { | |
519 | return new cat_node(m_left->clone(), m_right->clone()); | |
520 | } | |
521 | ||
522 | inline bool | |
523 | cat_node::nullable() const | |
524 | { | |
525 | return m_left->nullable() && m_right->nullable(); | |
526 | } | |
527 | ||
528 | inline node_set | |
529 | cat_node::firstpos() const | |
530 | { | |
531 | if (m_left->nullable()) | |
532 | { | |
533 | node_set rval; | |
534 | node_set l = m_left->firstpos(); | |
535 | node_set r = m_right->firstpos(); | |
536 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
537 | std::inserter(rval, rval.begin())); | |
538 | return rval; | |
539 | } | |
540 | else | |
541 | { | |
542 | return m_left->firstpos(); | |
543 | } | |
544 | } | |
545 | ||
546 | inline node_set | |
547 | cat_node::lastpos() const | |
548 | { | |
549 | if (m_right->nullable()) | |
550 | { | |
551 | node_set rval; | |
552 | node_set l = m_left->lastpos(); | |
553 | node_set r = m_right->lastpos(); | |
554 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
555 | std::inserter(rval, rval.begin())); | |
556 | return rval; | |
557 | } | |
558 | else | |
559 | { | |
560 | return m_right->lastpos(); | |
561 | } | |
562 | } | |
563 | ||
564 | inline void | |
565 | cat_node::compute_followpos(followpos_t& followpos) const | |
566 | { | |
567 | node_set l = m_left->lastpos(); | |
568 | for (node_set::iterator i = l.begin(); | |
569 | i != l.end(); | |
570 | ++i) | |
571 | { | |
572 | node_set rf = m_right->firstpos(); | |
573 | followpos[*i].insert(rf.begin(), rf.end()); | |
574 | } | |
575 | ||
576 | m_left->compute_followpos(followpos); | |
577 | m_right->compute_followpos(followpos); | |
578 | } | |
579 | ||
580 | inline void | |
581 | cat_node::compute_state_match(state_match_t& state_match) const | |
582 | { | |
583 | m_left->compute_state_match(state_match); | |
584 | m_right->compute_state_match(state_match); | |
585 | } | |
586 | ||
587 | inline void | |
588 | cat_node::get_eof_ids(node_set& eof_nodes) const | |
589 | { | |
590 | m_left->get_eof_ids(eof_nodes); | |
591 | m_right->get_eof_ids(eof_nodes); | |
592 | } | |
593 | ||
594 | inline void | |
595 | cat_node::assign_node_ids(node_id_t& node_count) | |
596 | { | |
597 | m_left->assign_node_ids(node_count); | |
598 | m_right->assign_node_ids(node_count); | |
599 | } | |
600 | ||
601 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
602 | inline void | |
603 | cat_node::dump(std::ostream& out) const | |
604 | { | |
605 | m_left->dump(out); | |
606 | ||
607 | out << "\ncat_node"; | |
608 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
609 | out << " firstpos() = "; | |
610 | node_set fp = firstpos(); | |
611 | std::copy(fp.begin(), fp.end(), | |
612 | std::ostream_iterator<node_id_t>(out, ",")); | |
613 | ||
614 | out << " lastpos() = "; | |
615 | node_set lp = lastpos(); | |
616 | std::copy(lp.begin(), lp.end(), | |
617 | std::ostream_iterator<node_id_t>(out, ",")); | |
618 | ||
619 | m_right->dump(out); | |
620 | } | |
621 | #endif | |
622 | ||
623 | ||
624 | class star_node : public node | |
625 | { | |
626 | ||
627 | public: | |
628 | ||
629 | star_node(node* left); | |
630 | star_node(const star_node& x); | |
631 | virtual ~star_node(){} | |
632 | ||
633 | virtual node* clone() const; | |
634 | virtual bool nullable() const; | |
635 | virtual node_set firstpos() const; | |
636 | virtual node_set lastpos() const; | |
637 | virtual void compute_followpos(followpos_t& followpos) const; | |
638 | virtual void compute_state_match(state_match_t& state_match ) const; | |
639 | virtual void get_eof_ids(node_set& eof_set) const; | |
640 | virtual void assign_node_ids(node_id_t& node_count); | |
641 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
642 | virtual void dump(std::ostream& out) const; | |
643 | #endif | |
644 | ||
645 | private: | |
646 | ||
b32b8144 FG |
647 | #ifdef BOOST_NO_AUTO_PTR |
648 | std::unique_ptr<node> m_left; | |
649 | #else | |
7c673cae | 650 | std::auto_ptr<node> m_left; |
b32b8144 | 651 | #endif |
7c673cae FG |
652 | }; |
653 | ||
654 | inline | |
655 | star_node::star_node(node* left) | |
656 | : node() | |
657 | , m_left(left) | |
658 | { | |
659 | } | |
660 | ||
661 | inline | |
662 | star_node::star_node(const star_node& x) | |
663 | : node(x) | |
664 | , m_left(x.m_left->clone()) | |
665 | { | |
666 | } | |
667 | ||
668 | inline node * | |
669 | star_node::clone() const | |
670 | { | |
671 | return new star_node(m_left->clone()); | |
672 | } | |
673 | ||
674 | inline bool | |
675 | star_node::nullable() const | |
676 | { | |
677 | return true; | |
678 | } | |
679 | ||
680 | inline node_set | |
681 | star_node::firstpos() const | |
682 | { | |
683 | return m_left->firstpos(); | |
684 | } | |
685 | ||
686 | inline node_set | |
687 | star_node::lastpos() const | |
688 | { | |
689 | return m_left->lastpos(); | |
690 | } | |
691 | ||
692 | inline void | |
693 | star_node::compute_followpos(followpos_t& followpos) const | |
694 | { | |
695 | node_set lp = this->lastpos(); | |
696 | for (node_set::iterator i = lp.begin(); | |
697 | i != lp.end(); | |
698 | ++i) | |
699 | { | |
700 | node_set fp = this->firstpos(); | |
701 | followpos[*i].insert(fp.begin(), fp.end()); | |
702 | } | |
703 | ||
704 | m_left->compute_followpos(followpos); | |
705 | } | |
706 | ||
707 | inline void | |
708 | star_node::compute_state_match(state_match_t& state_match) const | |
709 | { | |
710 | m_left->compute_state_match(state_match); | |
711 | } | |
712 | ||
713 | inline void | |
714 | star_node::get_eof_ids(node_set& eof_nodes) const | |
715 | { | |
716 | m_left->get_eof_ids(eof_nodes); | |
717 | } | |
718 | ||
719 | inline void | |
720 | star_node::assign_node_ids(node_id_t& node_count) | |
721 | { | |
722 | m_left->assign_node_ids(node_count); | |
723 | } | |
724 | ||
725 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
726 | inline void | |
727 | star_node::dump(std::ostream& out) const | |
728 | { | |
729 | m_left->dump(out); | |
730 | ||
731 | out << "\nstar_node"; | |
732 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
733 | out << " firstpos() = "; | |
734 | node_set fp = firstpos(); | |
735 | std::copy(fp.begin(), fp.end(), | |
736 | std::ostream_iterator<node_id_t>(out, ",")); | |
737 | ||
738 | out << " lastpos() = "; | |
739 | node_set lp = lastpos(); | |
740 | std::copy(lp.begin(), lp.end(), | |
741 | std::ostream_iterator<node_id_t>(out, ",")); | |
742 | ||
743 | } | |
744 | #endif | |
745 | ||
746 | ||
747 | class eof_node : public node | |
748 | { | |
749 | ||
750 | public: | |
751 | ||
752 | eof_node(); | |
753 | eof_node(const eof_node& x); | |
754 | virtual ~eof_node(){} | |
755 | ||
756 | virtual node* clone() const; | |
757 | virtual bool nullable() const; | |
758 | virtual node_set firstpos() const; | |
759 | virtual node_set lastpos() const; | |
760 | virtual void compute_followpos(followpos_t& followpos) const; | |
761 | virtual void compute_state_match(state_match_t& state_match ) const; | |
762 | virtual void get_eof_ids(node_set& eof_set) const; | |
763 | virtual void assign_node_ids(node_id_t& node_count); | |
764 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
765 | virtual void dump(std::ostream& out) const; | |
766 | #endif | |
767 | ||
768 | private: | |
769 | ||
770 | node_id_t m_node_num; | |
771 | }; | |
772 | ||
773 | inline | |
774 | eof_node::eof_node() | |
775 | : node() | |
776 | , m_node_num(0) | |
777 | { | |
778 | } | |
779 | ||
780 | inline | |
781 | eof_node::eof_node(const eof_node& x) | |
782 | : node(x) | |
783 | , m_node_num(x.m_node_num) | |
784 | { | |
785 | } | |
786 | ||
787 | inline node * | |
788 | eof_node::clone() const | |
789 | { | |
790 | return new eof_node(*this); | |
791 | } | |
792 | ||
793 | inline bool | |
794 | eof_node::nullable() const | |
795 | { | |
796 | return false; | |
797 | } | |
798 | ||
799 | inline node_set | |
800 | eof_node::firstpos() const | |
801 | { | |
802 | node_set rval; | |
803 | rval.insert(m_node_num); | |
804 | return rval; | |
805 | } | |
806 | ||
807 | inline node_set | |
808 | eof_node::lastpos() const | |
809 | { | |
810 | node_set rval; | |
811 | rval.insert(m_node_num); | |
812 | return rval; | |
813 | } | |
814 | ||
815 | inline void | |
816 | eof_node::compute_followpos(followpos_t&) const | |
817 | { | |
818 | return; | |
819 | } | |
820 | ||
821 | inline void | |
822 | eof_node::compute_state_match(state_match_t& state_match) const | |
823 | { | |
824 | if (state_match.size() < m_node_num + 1) | |
825 | state_match.resize(m_node_num + 1); | |
826 | state_match[m_node_num].resize(256, 0); | |
827 | } | |
828 | ||
829 | inline void | |
830 | eof_node::get_eof_ids(node_set& eof_nodes) const | |
831 | { | |
832 | eof_nodes.insert(m_node_num); | |
833 | } | |
834 | ||
835 | inline void | |
836 | eof_node::assign_node_ids(node_id_t& node_count) | |
837 | { | |
838 | m_node_num = node_count++; | |
839 | } | |
840 | ||
841 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
842 | inline void | |
843 | eof_node::dump(std::ostream& out) const | |
844 | { | |
845 | out << "\neof_node"; | |
846 | out << " m_node_num = " << m_node_num; | |
847 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
848 | out << " firstpos() = "; | |
849 | node_set fp = firstpos(); | |
850 | std::copy(fp.begin(), fp.end(), | |
851 | std::ostream_iterator<node_id_t>(out, ",")); | |
852 | ||
853 | out << " lastpos() = "; | |
854 | node_set lp = lastpos(); | |
855 | std::copy(lp.begin(), lp.end(), | |
856 | std::ostream_iterator<node_id_t>(out, ",")); | |
857 | } | |
858 | #endif | |
859 | ||
860 | class ccl_node : public node | |
861 | { | |
862 | ||
863 | public: | |
864 | ||
865 | ccl_node(const std::vector<uchar>& v); | |
866 | ccl_node(const uchar c1, const uchar c2); | |
867 | ccl_node(const ccl_node& x); | |
868 | virtual ~ccl_node(){} | |
869 | ||
870 | virtual node* clone() const; | |
871 | virtual bool nullable() const; | |
872 | virtual node_set firstpos() const; | |
873 | virtual node_set lastpos() const; | |
874 | virtual void compute_followpos(followpos_t& followpos) const; | |
875 | virtual void compute_state_match(state_match_t& state_match ) const; | |
876 | virtual void get_eof_ids(node_set& eof_set) const; | |
877 | virtual void assign_node_ids(node_id_t& node_count); | |
878 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
879 | virtual void dump(std::ostream& out) const; | |
880 | #endif | |
881 | ||
882 | private: | |
883 | ||
884 | std::vector<uchar> m_match; | |
885 | node_id_t m_node_num; | |
886 | }; | |
887 | ||
888 | inline | |
889 | ccl_node::ccl_node(const std::vector<uchar>& v) | |
890 | : node() | |
891 | , m_match(v) | |
892 | , m_node_num(0) | |
893 | { | |
894 | m_match.resize(256); // make sure it's the right size | |
895 | } | |
896 | ||
897 | inline | |
898 | ccl_node::ccl_node(const uchar c1, const uchar c2) | |
899 | : node() | |
900 | , m_match(256, uchar(0)) | |
901 | , m_node_num(0) | |
902 | { | |
903 | BOOST_ASSERT(c1 < c2); | |
904 | for (std::size_t i = c1; i <= std::size_t(c2); ++i) | |
905 | { | |
906 | m_match[i] = 1; | |
907 | } | |
908 | } | |
909 | ||
910 | inline | |
911 | ccl_node::ccl_node(const ccl_node& x) | |
912 | : node(x) | |
913 | , m_match(x.m_match) | |
914 | , m_node_num(x.m_node_num) | |
915 | { | |
916 | } | |
917 | ||
918 | inline node * | |
919 | ccl_node::clone() const | |
920 | { | |
921 | return new ccl_node(*this); | |
922 | } | |
923 | ||
924 | inline bool | |
925 | ccl_node::nullable() const | |
926 | { | |
927 | return false; | |
928 | } | |
929 | ||
930 | inline node_set | |
931 | ccl_node::firstpos() const | |
932 | { | |
933 | node_set rval; | |
934 | rval.insert(m_node_num); | |
935 | return rval; | |
936 | } | |
937 | ||
938 | inline node_set | |
939 | ccl_node::lastpos() const | |
940 | { | |
941 | return firstpos(); | |
942 | } | |
943 | ||
944 | inline void | |
945 | ccl_node::compute_followpos(followpos_t&) const | |
946 | { | |
947 | return; | |
948 | } | |
949 | ||
950 | inline void | |
951 | ccl_node::compute_state_match(state_match_t& state_match) const | |
952 | { | |
953 | if (state_match.size() < m_node_num + 1) | |
954 | state_match.resize(m_node_num + 1); | |
955 | state_match[m_node_num] = m_match; | |
956 | } | |
957 | ||
958 | inline void | |
959 | ccl_node::get_eof_ids(node_set&) const | |
960 | { | |
961 | return; | |
962 | } | |
963 | ||
964 | inline void | |
965 | ccl_node::assign_node_ids(node_id_t& node_count) | |
966 | { | |
967 | m_node_num = node_count++; | |
968 | } | |
969 | ||
970 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
971 | inline void | |
972 | ccl_node::dump(std::ostream& out) const | |
973 | { | |
974 | out << "\nccl_node m_match = "; | |
975 | for (std::size_t i = 0; i < m_match.size(); ++i) | |
976 | { | |
977 | if (m_match[i]) | |
978 | out << i << ", "; | |
979 | } | |
980 | out << " m_node_num = " << m_node_num; | |
981 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
982 | out << " firstpos() = "; | |
983 | node_set fp = firstpos(); | |
984 | std::copy(fp.begin(), fp.end(), | |
985 | std::ostream_iterator<node_id_t>(out, ",")); | |
986 | ||
987 | out << " lastpos() = "; | |
988 | node_set lp = lastpos(); | |
989 | std::copy(lp.begin(), lp.end(), | |
990 | std::ostream_iterator<node_id_t>(out, ",")); | |
991 | } | |
992 | #endif | |
993 | ||
994 | template <typename ScannerT> | |
995 | class make_concat | |
996 | { | |
997 | typedef typename ScannerT::iterator_t iterator_type; | |
998 | ||
999 | public: | |
1000 | ||
1001 | make_concat(std::stack<node*>& the_stack) | |
1002 | : m_stack(the_stack) | |
1003 | {} | |
1004 | ||
1005 | void operator()(iterator_type const &, iterator_type const &) const | |
1006 | { | |
1007 | node* right = m_stack.top(); | |
1008 | m_stack.pop(); | |
1009 | node* left = m_stack.top(); | |
1010 | m_stack.pop(); | |
1011 | node* newnode = new cat_node(left, right); | |
1012 | m_stack.push(newnode); | |
1013 | } | |
1014 | ||
1015 | std::stack<node*>& m_stack; | |
1016 | }; | |
1017 | ||
1018 | template <int CharTSize> | |
1019 | struct get_byte_aux; | |
1020 | ||
1021 | template<> | |
1022 | struct get_byte_aux<1> | |
1023 | { | |
1024 | template <typename CharT> | |
1025 | unsigned char operator()(CharT c, unsigned int byte) | |
1026 | { | |
1027 | BOOST_ASSERT(byte == 0); | |
1028 | return c; | |
1029 | } | |
1030 | }; | |
1031 | ||
1032 | template<> | |
1033 | struct get_byte_aux<2> | |
1034 | { | |
1035 | template <typename CharT> | |
1036 | unsigned char operator()(CharT c, unsigned int byte) | |
1037 | { | |
1038 | static unsigned long mask[] = | |
1039 | { | |
1040 | 0xFF00, | |
1041 | 0x00FF | |
1042 | }; | |
1043 | ||
1044 | BOOST_ASSERT(byte < 2); | |
1045 | return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); | |
1046 | } | |
1047 | }; | |
1048 | ||
1049 | template<> | |
1050 | struct get_byte_aux<4> | |
1051 | { | |
1052 | template <typename CharT> | |
1053 | unsigned char operator()(CharT c, unsigned int byte) | |
1054 | { | |
1055 | static unsigned long mask[] = | |
1056 | { | |
1057 | 0xFF000000, | |
1058 | 0x00FF0000, | |
1059 | 0x0000FF00, | |
1060 | 0x000000FF | |
1061 | }; | |
1062 | ||
1063 | BOOST_ASSERT(byte < 4); | |
1064 | return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); | |
1065 | } | |
1066 | }; | |
1067 | ||
1068 | template <typename CharT> | |
1069 | inline unsigned char | |
1070 | get_byte(CharT c, unsigned int byte) | |
1071 | { | |
1072 | return get_byte_aux<sizeof(c)>()(c, byte); | |
1073 | } | |
1074 | ||
1075 | template <typename ScannerT> | |
1076 | class make_star | |
1077 | { | |
1078 | typedef typename ScannerT::iterator_t iterator_type; | |
1079 | ||
1080 | public: | |
1081 | typedef | |
1082 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1083 | char_t; | |
1084 | ||
1085 | make_star(std::stack<node*>& the_stack) | |
1086 | : m_stack(the_stack) | |
1087 | {} | |
1088 | ||
1089 | void operator()(const char_t) const | |
1090 | { | |
1091 | node* left = m_stack.top(); | |
1092 | m_stack.pop(); | |
1093 | node* newnode = new star_node(left); | |
1094 | m_stack.push(newnode); | |
1095 | } | |
1096 | ||
1097 | std::stack<node*>& m_stack; | |
1098 | }; | |
1099 | ||
1100 | template <typename ScannerT> | |
1101 | class make_or | |
1102 | { | |
1103 | typedef typename ScannerT::iterator_t iterator_type; | |
1104 | ||
1105 | public: | |
1106 | ||
1107 | make_or(std::stack<node*>& the_stack) | |
1108 | : m_stack(the_stack) | |
1109 | {} | |
1110 | ||
1111 | void operator()(iterator_type const&, iterator_type const&) const | |
1112 | { | |
1113 | node* right = m_stack.top(); | |
1114 | m_stack.pop(); | |
1115 | node* left = m_stack.top(); | |
1116 | m_stack.pop(); | |
1117 | node* newnode = new or_node(left, right); | |
1118 | m_stack.push(newnode); | |
1119 | } | |
1120 | ||
1121 | std::stack<node*>& m_stack; | |
1122 | }; | |
1123 | ||
1124 | template <typename ScannerT> | |
1125 | class make_plus | |
1126 | { | |
1127 | typedef typename ScannerT::iterator_t iterator_type; | |
1128 | ||
1129 | public: | |
1130 | typedef | |
1131 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1132 | char_t; | |
1133 | ||
1134 | make_plus(std::stack<node*>& the_stack) | |
1135 | : m_stack(the_stack) | |
1136 | {} | |
1137 | ||
1138 | void operator()(const char_t) const | |
1139 | { | |
1140 | node* left = m_stack.top(); | |
1141 | m_stack.pop(); | |
1142 | ||
1143 | node* copy = left->clone(); | |
1144 | ||
1145 | node* new_star = new star_node(copy); | |
1146 | node* new_cat = new cat_node(left, new_star); | |
1147 | m_stack.push(new_cat); | |
1148 | } | |
1149 | ||
1150 | std::stack<node*>& m_stack; | |
1151 | }; | |
1152 | ||
1153 | template <typename ScannerT> | |
1154 | class make_optional | |
1155 | { | |
1156 | typedef typename ScannerT::iterator_t iterator_type; | |
1157 | ||
1158 | public: | |
1159 | typedef | |
1160 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1161 | char_t; | |
1162 | ||
1163 | make_optional(std::stack<node*>& the_stack) | |
1164 | : m_stack(the_stack) | |
1165 | {} | |
1166 | ||
1167 | void operator()(const char_t) const | |
1168 | { | |
1169 | node* left = m_stack.top(); | |
1170 | m_stack.pop(); | |
1171 | ||
1172 | node* new_or = new or_node(left, new epsilon_node()); | |
1173 | m_stack.push(new_or); | |
1174 | } | |
1175 | ||
1176 | std::stack<node*>& m_stack; | |
1177 | }; | |
1178 | ||
1179 | /////////////////////////////////////////////////////////////////////////////// | |
1180 | // utility function | |
1181 | template <typename CharT> | |
1182 | inline utility::impl::range<CharT> const& | |
1183 | full_range() | |
1184 | { | |
1185 | static utility::impl::range<CharT> full((std::numeric_limits<CharT>::min)(), | |
1186 | (std::numeric_limits<CharT>::max)()); | |
1187 | return full; | |
1188 | } | |
1189 | ||
1190 | namespace ccl_utils | |
1191 | { | |
1192 | template <typename char_t> | |
1193 | inline utility::impl::range_run<char_t> | |
1194 | negate_range_run( | |
1195 | const utility::impl::range_run<char_t>& rr) | |
1196 | { | |
1197 | utility::impl::range_run<char_t> newrr; | |
1198 | newrr.set(full_range<char_t>()); | |
1199 | for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); | |
1200 | iter != rr.end(); ++iter) | |
1201 | newrr.clear(*iter); | |
1202 | return newrr; | |
1203 | } | |
1204 | ||
1205 | template <typename char_t> | |
1206 | inline node* | |
1207 | create_mb_node_seq(char_t c) | |
1208 | { | |
1209 | node* newnode = new char_node(get_byte(c, 0)); | |
1210 | for (unsigned int i = 1; i < sizeof(c); ++i) | |
1211 | { | |
1212 | node* cnode = new char_node(get_byte(c, i)); | |
1213 | node* top_node = new cat_node(newnode, cnode); | |
1214 | newnode = top_node; | |
1215 | } | |
1216 | return newnode; | |
1217 | } | |
1218 | ||
1219 | template <typename char_t> | |
1220 | inline void | |
1221 | handle_mb_char(char_t c, bool first_time, | |
1222 | std::stack<node*>& stack) | |
1223 | { | |
1224 | node* newnode = create_mb_node_seq(c); | |
1225 | ||
1226 | if (first_time) | |
1227 | { | |
1228 | stack.push(newnode); | |
1229 | } | |
1230 | else | |
1231 | { | |
1232 | node* top = stack.top(); | |
1233 | stack.pop(); | |
1234 | ||
1235 | node* newtop = new or_node(top, newnode); | |
1236 | stack.push(newtop); | |
1237 | } | |
1238 | } | |
1239 | ||
1240 | // forward decl only | |
1241 | template <typename char_t> | |
1242 | inline void | |
1243 | handle_mb_range(char_t c1, char_t c2, bool first_time, | |
1244 | std::stack<node*>& stack); | |
1245 | ||
1246 | template <typename char_t> | |
1247 | inline void | |
1248 | create_nodes(const utility::impl::range_run<char_t>& rr, | |
1249 | std::stack<node*>& stack) | |
1250 | { | |
1251 | ||
1252 | if (sizeof(char_t) == 1) | |
1253 | { | |
1254 | std::vector<uchar> ccl; | |
1255 | ccl.resize(256); | |
1256 | for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); | |
1257 | iter != rr.end(); ++iter) | |
1258 | { | |
1259 | for (int i = iter->first; i <= iter->last; ++i) | |
1260 | { | |
1261 | // this is always true because of the limited datatype | |
1262 | // BOOST_ASSERT(uchar(i) < 256 && ccl.size() == 256); | |
1263 | ccl[uchar(i)] = 1; | |
1264 | } | |
1265 | } | |
1266 | ||
1267 | node* new_ccl = new ccl_node(ccl); | |
1268 | stack.push(new_ccl); | |
1269 | } | |
1270 | else | |
1271 | { | |
1272 | bool mb_first_time = true; | |
1273 | for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); | |
1274 | iter != rr.end(); ++iter) | |
1275 | { | |
1276 | if (iter->first == iter->last) | |
1277 | { | |
1278 | handle_mb_char(iter->first, mb_first_time, stack); | |
1279 | } | |
1280 | else | |
1281 | { | |
1282 | handle_mb_range(iter->first, iter->last, mb_first_time, stack); | |
1283 | } | |
1284 | mb_first_time = false; | |
1285 | } | |
1286 | } | |
1287 | } | |
1288 | ||
1289 | template <typename char_t> | |
1290 | inline std::size_t | |
1291 | compute_differing_byte(char_t c1, char_t c2) | |
1292 | { | |
1293 | std::size_t rval = 0; | |
1294 | while (rval < sizeof(c1) && | |
1295 | get_byte(c1, (unsigned int)rval) == get_byte(c2, (unsigned int)rval)) | |
1296 | { | |
1297 | ++rval; | |
1298 | } | |
1299 | return rval; | |
1300 | } | |
1301 | ||
1302 | template <typename char_t> | |
1303 | inline node* | |
1304 | create_mb_node_type1(std::size_t j, char_t c1, char_t c2) | |
1305 | { | |
1306 | std::size_t diff = get_byte(c2, (unsigned int)j) - | |
1307 | get_byte(c1, (unsigned int)j); | |
1308 | if (diff == 1) { | |
1309 | return 0; | |
1310 | } | |
1311 | else if (diff == 2) { | |
1312 | return new char_node(get_byte(c1, (unsigned int)j)+1); | |
1313 | } | |
1314 | else { | |
1315 | return new ccl_node(get_byte(c1, (unsigned int)j)+1, | |
1316 | get_byte(c2, (unsigned int)j)-1); | |
1317 | } | |
1318 | } | |
1319 | ||
1320 | template <typename char_t> | |
1321 | inline node * | |
1322 | create_mb_node_for_byte(std::size_t i, std::size_t j, std::size_t sizem1, | |
1323 | std::size_t differing_byte, char_t c1, char_t c2, node* newnode) | |
1324 | { | |
1325 | node* cnode; | |
1326 | if (i == sizem1 && j == differing_byte && j != sizem1) | |
1327 | { | |
1328 | node* tmp = create_mb_node_type1(j, c1, c2); | |
1329 | if (tmp == 0) | |
1330 | { | |
1331 | delete newnode; | |
1332 | return 0; | |
1333 | } | |
1334 | else | |
1335 | cnode = tmp; | |
1336 | } | |
1337 | else if (i == differing_byte && j == sizem1) | |
1338 | { | |
1339 | if (i != sizem1) { | |
1340 | cnode = new ccl_node(get_byte(c1, (unsigned int)j), 0xFF); | |
1341 | } | |
1342 | else { | |
1343 | cnode = new ccl_node(get_byte(c1, (unsigned int)j), | |
1344 | get_byte(c2, (unsigned int)j)); | |
1345 | } | |
1346 | } | |
1347 | else if (i != differing_byte && i != sizem1 && | |
1348 | j == (sizem1 - i + differing_byte)) | |
1349 | { | |
1350 | cnode = new ccl_node(get_byte(c1, (unsigned int)j)+1, 0xFF); | |
1351 | } | |
1352 | else if (i + j - differing_byte > sizem1) { | |
1353 | cnode = new ccl_node(0, 0xFF); | |
1354 | } | |
1355 | else {//if (is plain) | |
1356 | cnode = new char_node(get_byte(c1, (unsigned int)j)); | |
1357 | } | |
1358 | ||
1359 | node* top_node = new cat_node(newnode, cnode); | |
1360 | return top_node; | |
1361 | } | |
1362 | ||
1363 | // On platforms, where wchar_t is a typedef for unsigned short, the | |
1364 | // comparision for a negative value is pointless | |
1365 | template <bool is_signed> | |
1366 | struct correct_char_aux { | |
1367 | }; | |
1368 | ||
1369 | template <> | |
1370 | struct correct_char_aux<true> { | |
1371 | ||
1372 | template <typename char_t> | |
1373 | static char_t correct(char_t c) { if (c < 0) c = 0; return c; } | |
1374 | }; | |
1375 | ||
1376 | template <> | |
1377 | struct correct_char_aux<false> { | |
1378 | ||
1379 | template <typename char_t> | |
1380 | static char_t correct(char_t c) { return c; } | |
1381 | }; | |
1382 | ||
1383 | template <typename char_t> | |
1384 | struct correct_char | |
1385 | { | |
1386 | static char_t correct(char_t c) | |
1387 | { | |
1388 | return correct_char_aux<std::numeric_limits<char_t>::is_signed >:: | |
1389 | correct(c); | |
1390 | } | |
1391 | }; | |
1392 | ||
1393 | template <typename char_t> | |
1394 | inline void | |
1395 | handle_mb_range(char_t c1, char_t c2, bool first_time, | |
1396 | std::stack<node*>& stack) | |
1397 | { | |
1398 | // The algorithm can't handle negative value chars, which don't make | |
1399 | // much sense anyway. This comparision is pointless for wchar_t's on | |
1400 | // platforms, where wchar_t is a typedef for unsigned short | |
1401 | ||
1402 | c1 = correct_char<char_t>::correct(c1); | |
1403 | //if (c1 < 0) | |
1404 | // c1 = 0; | |
1405 | ||
1406 | BOOST_ASSERT(c1 < c2); | |
1407 | node* newnode = 0; | |
1408 | node* savednode = 0; | |
1409 | const std::size_t differing_byte = compute_differing_byte(c1, c2); | |
1410 | const std::size_t sizem1 = sizeof(c1) - 1; | |
1411 | const std::size_t ndb = sizem1 - differing_byte; | |
1412 | for (std::size_t i = differing_byte; i < sizeof(c1); ++i) | |
1413 | { | |
1414 | // generate node for the first byte | |
1415 | if (differing_byte == 0 && i == ndb) | |
1416 | { | |
1417 | node* tmp = create_mb_node_type1(0, c1, c2); | |
1418 | if (tmp == 0) | |
1419 | continue; | |
1420 | else | |
1421 | newnode = tmp; | |
1422 | } | |
1423 | else | |
1424 | { | |
1425 | newnode = new char_node(get_byte(c1, 0)); | |
1426 | } | |
1427 | for (std::size_t j = 1; j < sizeof(c1); ++j) | |
1428 | { | |
1429 | newnode = create_mb_node_for_byte(i, j, sizem1, differing_byte, | |
1430 | c1, c2, newnode); | |
1431 | if (newnode == 0) | |
1432 | goto end_outer_for; | |
1433 | } | |
1434 | ||
1435 | // or together the various parts | |
1436 | if (savednode) | |
1437 | { | |
1438 | node* top_node = new or_node(savednode, newnode); | |
1439 | savednode = top_node; | |
1440 | } | |
1441 | else | |
1442 | { | |
1443 | savednode = newnode; | |
1444 | } | |
1445 | end_outer_for: | |
1446 | continue; | |
1447 | } | |
1448 | ||
1449 | for (std::size_t k = 0; k < ndb; ++k) | |
1450 | { | |
1451 | newnode = new char_node(get_byte(c2, 0)); | |
1452 | for (std::size_t j = 1; j < sizeof(c2); ++j) | |
1453 | { | |
1454 | node* cnode; | |
1455 | if (k == differing_byte && j == sizem1 && k != sizem1) | |
1456 | cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)); | |
1457 | ||
1458 | else if (k != differing_byte && k != sizem1 && | |
1459 | j == (sizem1 - k + differing_byte)) | |
1460 | cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)-1); | |
1461 | ||
1462 | else if (k + j - differing_byte > sizem1) | |
1463 | cnode = new ccl_node(0, 0xFF); | |
1464 | ||
1465 | else //if (is plain) | |
1466 | cnode = new char_node(get_byte(c2, (unsigned int)j)); | |
1467 | ||
1468 | ||
1469 | node* top_node = new cat_node(newnode, cnode); | |
1470 | newnode = top_node; | |
1471 | } | |
1472 | ||
1473 | // or together the various parts | |
1474 | if (savednode) | |
1475 | { | |
1476 | node* top_node = new or_node(savednode, newnode); | |
1477 | savednode = top_node; | |
1478 | } | |
1479 | else | |
1480 | { | |
1481 | savednode = newnode; | |
1482 | } | |
1483 | } | |
1484 | ||
1485 | ||
1486 | if (first_time) | |
1487 | { | |
1488 | stack.push(savednode); | |
1489 | } | |
1490 | else | |
1491 | { | |
1492 | node* top = stack.top(); | |
1493 | stack.pop(); | |
1494 | ||
1495 | node* newtop = new or_node(top, savednode); | |
1496 | stack.push(newtop); | |
1497 | } | |
1498 | } | |
1499 | } // namespace ccl_utils | |
1500 | ||
1501 | template <typename ScannerT> | |
1502 | class make_char | |
1503 | { | |
1504 | typedef typename ScannerT::iterator_t iterator_type; | |
1505 | ||
1506 | public: | |
1507 | typedef | |
1508 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1509 | char_t; | |
1510 | ||
1511 | make_char(std::stack<node*>& the_stack) | |
1512 | : m_stack(the_stack) | |
1513 | {} | |
1514 | ||
1515 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1516 | { | |
1517 | const escape_char_parser<lex_escapes, char_t> lex_escape_ch = | |
1518 | escape_char_parser<lex_escapes, char_t>(); | |
1519 | char_t the_char; | |
1520 | iterator_type first_ = first; | |
1521 | ScannerT scan(first_, last); | |
1522 | lex_escape_ch[assign(the_char)].parse(scan); | |
1523 | node* newnode = ccl_utils::create_mb_node_seq(the_char); | |
1524 | m_stack.push(newnode); | |
1525 | } | |
1526 | ||
1527 | std::stack<node*>& m_stack; | |
1528 | }; | |
1529 | ||
1530 | ||
1531 | template <typename ScannerT> | |
1532 | class make_ccl | |
1533 | { | |
1534 | typedef typename ScannerT::iterator_t iterator_type; | |
1535 | ||
1536 | public: | |
1537 | typedef | |
1538 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1539 | char_t; | |
1540 | ||
1541 | make_ccl(std::stack<node*>& the_stack) | |
1542 | : m_stack(the_stack) | |
1543 | {} | |
1544 | ||
1545 | static bool is_equal_to_string(iterator_type first, | |
1546 | iterator_type const & last, const char* str) | |
1547 | { | |
1548 | while (first != last &&*str &&*first ==*str) | |
1549 | { | |
1550 | ++first; | |
1551 | ++str; | |
1552 | } | |
1553 | return*str == 0; | |
1554 | } | |
1555 | ||
1556 | template <typename ParserT> | |
1557 | static void fill_ccl(utility::impl::range_run<char_t>& rr, const ParserT& parser) | |
1558 | { | |
1559 | for (int i = 0; i < 256; ++i) | |
1560 | { | |
1561 | if (parser.test(static_cast<char_t>(uchar(i)))) | |
1562 | rr.set(utility::impl::range<char_t>(char_t(i), char_t(i))); | |
1563 | } | |
1564 | } | |
1565 | ||
1566 | void operator()(iterator_type const& first_, iterator_type const& last) const | |
1567 | { | |
1568 | BOOST_ASSERT(*first_ == '['); | |
1569 | ||
1570 | iterator_type first = first_; | |
1571 | ++first; // skip over '[' | |
1572 | bool negated_ccl = false; | |
1573 | if (*first == '^') | |
1574 | { | |
1575 | negated_ccl = true; | |
1576 | ++first; | |
1577 | } | |
1578 | ||
1579 | utility::impl::range_run<char_t> rr; | |
1580 | while (first != last &&*first != ']') | |
1581 | { | |
1582 | if (*first == '[') // it's a ccl_expr like [:space:] | |
1583 | { | |
1584 | // check for [:space:], etc. | |
1585 | if (is_equal_to_string(first, last, "[:alnum:]")) | |
1586 | { | |
1587 | fill_ccl(rr, alnum_p); | |
1588 | } | |
1589 | else if (is_equal_to_string(first, last, "[:alpha:]")) | |
1590 | { | |
1591 | fill_ccl(rr, alpha_p); | |
1592 | } | |
1593 | else if (is_equal_to_string(first, last, "[:blank:]")) | |
1594 | { | |
1595 | fill_ccl(rr, blank_p); | |
1596 | } | |
1597 | else if (is_equal_to_string(first, last, "[:cntrl:]")) | |
1598 | { | |
1599 | fill_ccl(rr, cntrl_p); | |
1600 | } | |
1601 | else if (is_equal_to_string(first, last, "[:digit:]")) | |
1602 | { | |
1603 | fill_ccl(rr, digit_p); | |
1604 | } | |
1605 | else if (is_equal_to_string(first, last, "[:graph:]")) | |
1606 | { | |
1607 | fill_ccl(rr, graph_p); | |
1608 | } | |
1609 | else if (is_equal_to_string(first, last, "[:lower:]")) | |
1610 | { | |
1611 | fill_ccl(rr, lower_p); | |
1612 | } | |
1613 | else if (is_equal_to_string(first, last, "[:print:]")) | |
1614 | { | |
1615 | fill_ccl(rr, print_p); | |
1616 | } | |
1617 | else if (is_equal_to_string(first, last, "[:punct:]")) | |
1618 | { | |
1619 | fill_ccl(rr, punct_p); | |
1620 | } | |
1621 | else if (is_equal_to_string(first, last, "[:space:]")) | |
1622 | { | |
1623 | fill_ccl(rr, space_p); | |
1624 | } | |
1625 | else if (is_equal_to_string(first, last, "[:upper:]")) | |
1626 | { | |
1627 | fill_ccl(rr, upper_p); | |
1628 | } | |
1629 | else if (is_equal_to_string(first, last, "[:xdigit:]")) | |
1630 | { | |
1631 | fill_ccl(rr, xdigit_p); | |
1632 | } | |
1633 | // this can't happen, because it's parsed before we get here. | |
1634 | //else | |
1635 | // throw bad_regex(); | |
1636 | ||
1637 | // Advance past the character class expression | |
1638 | while (first != last &&*first != ']') | |
1639 | ++first; | |
1640 | BOOST_ASSERT(*first == ']'); | |
1641 | ++first; | |
1642 | } | |
1643 | else { | |
1644 | const escape_char_parser<lex_escapes, char_t> lex_escape_ch = | |
1645 | escape_char_parser<lex_escapes, char_t>(); | |
1646 | ||
1647 | char_t c1; | |
1648 | ScannerT scan(first, last); | |
1649 | lex_escape_ch[assign(c1)].parse(scan); | |
1650 | if (*scan.first == '-') // insert a range | |
1651 | { | |
1652 | ++scan.first; | |
1653 | char_t c2; | |
1654 | lex_escape_ch[assign(c2)].parse(scan); | |
1655 | BOOST_ASSERT(c1 < c2); // Throw exception? | |
1656 | rr.set(utility::impl::range<char_t>(c1, c2)); | |
1657 | } | |
1658 | else // insert 1 char | |
1659 | { | |
1660 | rr.set(utility::impl::range<char_t>(c1, c1)); | |
1661 | } | |
1662 | } | |
1663 | } | |
1664 | ||
1665 | if (negated_ccl) | |
1666 | { | |
1667 | rr = ccl_utils::negate_range_run(rr); | |
1668 | } | |
1669 | ||
1670 | ccl_utils::create_nodes(rr, m_stack); | |
1671 | } | |
1672 | ||
1673 | std::stack<node*>& m_stack; | |
1674 | }; | |
1675 | ||
1676 | template <typename ScannerT> | |
1677 | class make_any_char | |
1678 | { | |
1679 | typedef typename ScannerT::iterator_t iterator_type; | |
1680 | ||
1681 | public: | |
1682 | typedef | |
1683 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1684 | char_t; | |
1685 | ||
1686 | std::stack<node*>& m_stack; | |
1687 | ||
1688 | make_any_char(std::stack<node*>& the_stack) | |
1689 | : m_stack(the_stack) | |
1690 | {} | |
1691 | ||
1692 | void operator()(const char_t c) const | |
1693 | { | |
1694 | BOOST_ASSERT(c == '.'); | |
1695 | do_any_char(); | |
1696 | } | |
1697 | ||
1698 | void do_any_char() const | |
1699 | { | |
1700 | static utility::impl::range_run<char_t> rr; | |
1701 | rr.set(full_range<char_t>()); | |
1702 | char_t newline = '\n'; | |
1703 | rr.clear(utility::impl::range<char_t>(newline, newline)); | |
1704 | ||
1705 | ccl_utils::create_nodes(rr, m_stack); | |
1706 | } | |
1707 | }; | |
1708 | ||
1709 | template <typename ScannerT> | |
1710 | class make_string | |
1711 | { | |
1712 | typedef typename ScannerT::iterator_t iterator_type; | |
1713 | ||
1714 | public: | |
1715 | typedef | |
1716 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1717 | char_t; | |
1718 | ||
1719 | std::stack<node*>& m_stack; | |
1720 | ||
1721 | make_string(std::stack<node*>& the_stack) | |
1722 | : m_stack(the_stack) | |
1723 | {} | |
1724 | ||
1725 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1726 | { | |
1727 | BOOST_ASSERT(*first == '"'); | |
1728 | ||
1729 | iterator_type first_ = first; | |
1730 | ScannerT scan(first_, last); | |
1731 | ++scan.first; // skip over '"' | |
1732 | ||
1733 | // empty string not allowed | |
1734 | if (*scan.first == '"') | |
1735 | { | |
1736 | boost::throw_exception(bad_regex()); | |
1737 | } | |
1738 | ||
1739 | const escape_char_parser<lex_escapes, char_t> lex_escape_ch = | |
1740 | escape_char_parser<lex_escapes, char_t>(); | |
1741 | ||
1742 | char_t c; | |
1743 | lex_escape_ch[assign(c)].parse(scan); | |
1744 | node* top_node = ccl_utils::create_mb_node_seq(c); | |
1745 | ||
1746 | while (*scan.first != '"' && scan.first != scan.last) | |
1747 | { | |
1748 | lex_escape_ch[assign(c)].parse(scan); | |
1749 | node* cur_node = ccl_utils::create_mb_node_seq(c); | |
1750 | top_node = new cat_node(top_node, cur_node); | |
1751 | } | |
1752 | m_stack.push(top_node); | |
1753 | } | |
1754 | }; | |
1755 | ||
1756 | inline | |
1757 | node* repeat_node(node* n, int num) | |
1758 | { | |
1759 | node* list_of_nodes = n; | |
1760 | for (int i = 1; i < num; ++i) | |
1761 | { | |
1762 | list_of_nodes = new cat_node(list_of_nodes, n->clone()); | |
1763 | } | |
1764 | return list_of_nodes; | |
1765 | } | |
1766 | ||
1767 | inline | |
1768 | node* optional_node(node* n) | |
1769 | { | |
1770 | return new or_node(n, new epsilon_node()); | |
1771 | } | |
1772 | ||
1773 | template <typename ScannerT> | |
1774 | class make_rep1 | |
1775 | { | |
1776 | typedef typename ScannerT::iterator_t iterator_type; | |
1777 | ||
1778 | public: | |
1779 | typedef | |
1780 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1781 | char_t; | |
1782 | ||
1783 | std::stack<node*>& m_stack; | |
1784 | ||
1785 | make_rep1(std::stack<node*>& the_stack) | |
1786 | : m_stack(the_stack) | |
1787 | {} | |
1788 | ||
1789 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1790 | { | |
1791 | BOOST_ASSERT(*first == '{'); | |
1792 | ||
1793 | iterator_type first_ = first; | |
1794 | ScannerT scan(first_, last); | |
1795 | ++scan.first; // skip over '{' | |
1796 | ||
1797 | unsigned int count; | |
1798 | uint_p[assign(count)].parse(scan); | |
1799 | if (count == 0) | |
1800 | boost::throw_exception(bad_regex()); | |
1801 | ||
1802 | node* top_node = m_stack.top(); | |
1803 | m_stack.pop(); | |
1804 | top_node = repeat_node(top_node, count); | |
1805 | m_stack.push(top_node); | |
1806 | } | |
1807 | }; | |
1808 | ||
1809 | template <typename ScannerT> | |
1810 | class make_rep2 | |
1811 | { | |
1812 | typedef typename ScannerT::iterator_t iterator_type; | |
1813 | ||
1814 | public: | |
1815 | typedef | |
1816 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1817 | char_t; | |
1818 | ||
1819 | std::stack<node*>& m_stack; | |
1820 | ||
1821 | make_rep2(std::stack<node*>& the_stack) | |
1822 | : m_stack(the_stack) | |
1823 | {} | |
1824 | ||
1825 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1826 | { | |
1827 | BOOST_ASSERT(*first == '{'); | |
1828 | ||
1829 | iterator_type first_ = first; | |
1830 | ScannerT scan (first_, last); | |
1831 | ++scan.first; // skip over '{' | |
1832 | ||
1833 | unsigned int count; | |
1834 | uint_p[assign(count)].parse(scan); | |
1835 | if (count == 0) | |
1836 | boost::throw_exception(bad_regex()); | |
1837 | ||
1838 | node* top_node = m_stack.top(); | |
1839 | m_stack.pop(); | |
1840 | top_node = new cat_node(repeat_node(top_node, count), | |
1841 | new star_node(top_node->clone())); | |
1842 | m_stack.push(top_node); | |
1843 | ||
1844 | } | |
1845 | }; | |
1846 | ||
1847 | template <typename ScannerT> | |
1848 | class make_rep3 | |
1849 | { | |
1850 | typedef typename ScannerT::iterator_t iterator_type; | |
1851 | ||
1852 | public: | |
1853 | typedef | |
1854 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1855 | char_t; | |
1856 | ||
1857 | std::stack<node*>& m_stack; | |
1858 | ||
1859 | make_rep3(std::stack<node*>& the_stack) | |
1860 | : m_stack(the_stack) | |
1861 | {} | |
1862 | ||
1863 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1864 | { | |
1865 | BOOST_ASSERT(*first == '{'); | |
1866 | ||
1867 | iterator_type first_ = first; | |
1868 | ScannerT scan(first_, last); | |
1869 | ++scan.first; // skip over '{' | |
1870 | ||
1871 | unsigned int count1, count2; | |
1872 | uint_p[assign(count1)].parse(scan); | |
1873 | if (count1 == 0) | |
1874 | boost::throw_exception(bad_regex()); | |
1875 | ||
1876 | ++scan.first; // skip over ',' | |
1877 | ||
1878 | uint_p[assign(count2)].parse(scan); | |
1879 | if (count2 <= count1) | |
1880 | boost::throw_exception(bad_regex()); | |
1881 | ||
1882 | node* top_node = m_stack.top(); | |
1883 | m_stack.pop(); | |
1884 | node* repeats = repeat_node(top_node, count1); | |
1885 | top_node = new cat_node(repeats, | |
1886 | repeat_node(optional_node(top_node->clone()), | |
1887 | count2 - count1)); | |
1888 | ||
1889 | m_stack.push(top_node); | |
1890 | } | |
1891 | }; | |
1892 | ||
1893 | /////////////////////////////////////////////////////////////////////////////// | |
1894 | // | |
1895 | // Lexer grammar | |
1896 | // | |
1897 | // Defines the grammar, which mandates the syntax of the understood | |
1898 | // lexeme definitions passed to lexer::register_regex. | |
1899 | // | |
1900 | /////////////////////////////////////////////////////////////////////////////// | |
1901 | class lexer_grammar : public boost::spirit::classic::grammar<lexer_grammar> | |
1902 | { | |
1903 | public: | |
1904 | lexer_grammar(std::stack<node*> &node_stack_) | |
1905 | : node_stack(node_stack_) {} | |
1906 | ||
1907 | template <typename ScannerT> | |
1908 | struct definition | |
1909 | { | |
1910 | typedef rule<ScannerT> rule_t; | |
1911 | typedef typename ScannerT::iterator_t iterator_type; | |
1912 | typedef | |
1913 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1914 | char_t; | |
1915 | ||
1916 | rule_t regex, re, series, singleton, singleton2, fullccl, ccl, string, | |
1917 | escseq, ccl_char; | |
1918 | symbols<> ccl_expr; | |
1919 | ||
1920 | definition(lexer_grammar const &self) | |
1921 | { | |
1922 | regex = | |
1923 | re >> !('/' >> re) >> !ch_p('$') | |
1924 | ; | |
1925 | ||
1926 | re = | |
1927 | series | |
1928 | >>*( ('|' >> series)[make_or<ScannerT>(self.node_stack)] ) | |
1929 | ; | |
1930 | ||
1931 | series = | |
1932 | singleton | |
1933 | >>*( singleton[make_concat<ScannerT>(self.node_stack)] ) | |
1934 | ; | |
1935 | ||
1936 | singleton = | |
1937 | ch_p('.')[make_any_char<ScannerT>(self.node_stack)] | |
1938 | >> singleton2 | |
1939 | | fullccl | |
1940 | >> singleton2 | |
1941 | | ('"' >> string >> '"') | |
1942 | [ | |
1943 | make_string<ScannerT>(self.node_stack) | |
1944 | ] | |
1945 | >> singleton2 | |
1946 | | '(' >> re >> ')' | |
1947 | >> singleton2 | |
1948 | | ((anychar_p - chset<>("/|*+?.(){}\\")) | escseq) | |
1949 | [ | |
1950 | make_char<ScannerT>(self.node_stack) | |
1951 | ] | |
1952 | >> singleton2 | |
1953 | ; | |
1954 | ||
1955 | singleton2 = | |
1956 | ch_p('*')[make_star<ScannerT>(self.node_stack)] | |
1957 | >> singleton2 | |
1958 | | ch_p('+')[make_plus<ScannerT>(self.node_stack)] | |
1959 | >> singleton2 | |
1960 | | ch_p('?')[make_optional<ScannerT>(self.node_stack)] | |
1961 | >> singleton2 | |
1962 | | ('{' >> uint_p >> '}') | |
1963 | [ | |
1964 | make_rep1<ScannerT>(self.node_stack) | |
1965 | ] | |
1966 | >> singleton2 | |
1967 | | ('{' >> uint_p >> ',' >> '}') | |
1968 | [ | |
1969 | make_rep2<ScannerT>(self.node_stack) | |
1970 | ] | |
1971 | >> singleton2 | |
1972 | | ('{' >> uint_p >> ',' >> uint_p >> '}') | |
1973 | [ | |
1974 | make_rep3<ScannerT>(self.node_stack) | |
1975 | ] | |
1976 | >> singleton2 | |
1977 | | epsilon_p | |
1978 | ; | |
1979 | ||
1980 | fullccl = | |
1981 | ('[' >> !ch_p('^') >> ccl >> ']') | |
1982 | [ | |
1983 | make_ccl<ScannerT>(self.node_stack) | |
1984 | ] | |
1985 | ; | |
1986 | ||
1987 | ccl = | |
1988 | *(ccl_expr | (ccl_char >> !('-' >> ccl_char))) | |
1989 | ; | |
1990 | ||
1991 | ccl_char = | |
1992 | ( (anychar_p - chset<>("\\\n]")) | escseq ) | |
1993 | ; | |
1994 | ||
1995 | ccl_expr = | |
1996 | "[:alnum:]", | |
1997 | "[:alpha:]", | |
1998 | "[:blank:]", | |
1999 | "[:cntrl:]", | |
2000 | "[:digit:]", | |
2001 | "[:graph:]", | |
2002 | "[:lower:]", | |
2003 | "[:print:]", | |
2004 | "[:punct:]", | |
2005 | "[:space:]", | |
2006 | "[:upper:]", | |
2007 | "[:xdigit:]" | |
2008 | ; | |
2009 | ||
2010 | string = | |
2011 | +( (anychar_p - chset<>("\"\\")) | escseq ) | |
2012 | ; | |
2013 | ||
2014 | typedef | |
2015 | uint_parser<char_t, 8, 1, | |
2016 | std::numeric_limits<char_t>::digits / 3 + 1 | |
2017 | > oct_parser_t; | |
2018 | typedef | |
2019 | uint_parser<char_t, 16, 1, | |
2020 | std::numeric_limits<char_t>::digits / 4 + 1 | |
2021 | > hex_parser_t; | |
2022 | ||
2023 | escseq = | |
2024 | ch_p('\\') | |
2025 | >> ( | |
2026 | oct_parser_t() | |
2027 | | as_lower_d['x'] >> hex_parser_t() | |
2028 | | (anychar_p - chset<>('\n')) | |
2029 | ) | |
2030 | ; | |
2031 | ||
2032 | #define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2033 | ||
2034 | BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG); | |
2035 | BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG); | |
2036 | BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG); | |
2037 | BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG); | |
2038 | BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG); | |
2039 | BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG); | |
2040 | BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG); | |
2041 | BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG); | |
2042 | BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG); | |
2043 | BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG); | |
2044 | ||
2045 | #undef BOOST_SLEX_DEBUG | |
2046 | } | |
2047 | ||
2048 | rule<ScannerT> const& | |
2049 | start() const { return regex; } | |
2050 | }; | |
2051 | ||
2052 | std::stack<node*> &node_stack; | |
2053 | ||
2054 | }; // class lexer_grammar | |
2055 | ||
2056 | template <typename StringT> | |
2057 | inline node * | |
2058 | parse(lexer_grammar& g, StringT const& str) | |
2059 | { | |
2060 | typedef | |
2061 | scanner<typename StringT::const_iterator, scanner_policies<> > | |
2062 | scanner_t; | |
2063 | typedef typename rule<scanner_t>::template result<scanner_t>::type | |
2064 | result_t; | |
2065 | ||
2066 | typename StringT::const_iterator first = str.begin(); | |
2067 | typename StringT::const_iterator last = str.end(); | |
2068 | ||
2069 | scanner_t scan(first, last); | |
2070 | // typename rule<scanner_t>::result_t hit = g.parse(scan); | |
2071 | result_t hit = g.parse(scan); | |
2072 | if (!hit || !scan.at_end()) | |
2073 | { | |
2074 | while (g.node_stack.size()) | |
2075 | { | |
2076 | delete g.node_stack.top(); | |
2077 | g.node_stack.pop(); | |
2078 | } | |
2079 | return 0; | |
2080 | } | |
2081 | ||
2082 | BOOST_ASSERT(g.node_stack.size() == 1); | |
2083 | node* rval = g.node_stack.top(); | |
2084 | g.node_stack.pop(); | |
2085 | node* an_eof_node = new eof_node(); | |
2086 | rval = new cat_node(rval, an_eof_node); | |
2087 | return rval; | |
2088 | } | |
2089 | ||
2090 | inline | |
2091 | void make_case_insensitive(state_match_t& state_match) | |
2092 | { | |
2093 | // TODO: Fix this. | |
2094 | // This doesn't take into account foreign languages, figure out how to | |
2095 | // do that. Also this approach is broken for this implementation of | |
2096 | // wide chars. | |
2097 | for (state_match_t::iterator iter = state_match.begin(); | |
2098 | iter != state_match.end(); ++iter) | |
2099 | { | |
2100 | int i, j; | |
2101 | for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) | |
2102 | { | |
2103 | // if either is set, turn them both on | |
2104 | (*iter)[i] = (*iter)[j] = uchar((*iter)[i] | (*iter)[j]); | |
2105 | } | |
2106 | } | |
2107 | } | |
2108 | ||
2109 | template<bool wide_char> | |
2110 | struct regex_match_helper; | |
2111 | ||
2112 | template<> | |
2113 | struct regex_match_helper<false> // single byte char | |
2114 | { | |
2115 | template <typename DfaT, typename IteratorT> | |
2116 | static bool | |
2117 | do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, | |
2118 | int& regex_index, | |
2119 | std::basic_string< | |
2120 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2121 | > *token) | |
2122 | { | |
2123 | typedef std::basic_string< | |
2124 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2125 | > string_type; | |
2126 | typedef typename string_type::size_type size_type; | |
2127 | ||
2128 | node_id_t s = 0; | |
2129 | node_id_t last_accepting_index = invalid_node; | |
2130 | IteratorT p = first; | |
2131 | IteratorT last_accepting_cpos = first; | |
2132 | while (p != last) | |
2133 | { | |
2134 | s = dfa.transition_table[s][(uchar)*p]; | |
2135 | if (s == invalid_node) | |
2136 | break; | |
2137 | if (token) token->append((size_type)1, *p); | |
2138 | ++p; | |
2139 | if (dfa.acceptance_index[s] != invalid_node) | |
2140 | { | |
2141 | last_accepting_index = s; | |
2142 | last_accepting_cpos = p; | |
2143 | } | |
2144 | } | |
2145 | if (last_accepting_index != invalid_node) | |
2146 | { | |
2147 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2148 | std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << | |
2149 | dfa.acceptance_index[last_accepting_index] << '\n'; | |
2150 | #endif | |
2151 | ||
2152 | first = last_accepting_cpos; | |
2153 | regex_index = dfa.acceptance_index[last_accepting_index]; | |
2154 | return true; | |
2155 | } | |
2156 | else | |
2157 | return false; | |
2158 | } | |
2159 | }; | |
2160 | ||
2161 | template<> | |
2162 | struct regex_match_helper<true> // wide char | |
2163 | { | |
2164 | template <typename DfaT, typename IteratorT> | |
2165 | static bool | |
2166 | do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, | |
2167 | int& regex_index, | |
2168 | std::basic_string< | |
2169 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2170 | > *token) | |
2171 | { | |
2172 | typedef | |
2173 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2174 | char_t; | |
2175 | typedef std::basic_string<char_t> string_type; | |
2176 | typedef typename string_type::size_type size_type; | |
2177 | ||
2178 | node_id_t s = 0; | |
2179 | node_id_t last_accepting_index = invalid_node; | |
2180 | IteratorT wp = first; | |
2181 | IteratorT last_accepting_cpos = first; | |
2182 | ||
2183 | while (wp != last) | |
2184 | { | |
2185 | for (unsigned int i = 0; i < sizeof(char_t); ++i) | |
2186 | { | |
2187 | s = dfa.transition_table[s][get_byte(*wp,i)]; | |
2188 | if (s == invalid_node) | |
2189 | { | |
2190 | goto break_while; | |
2191 | } | |
2192 | } | |
2193 | if (token) token->append((size_type)1, *wp); | |
2194 | ++wp; | |
2195 | if (dfa.acceptance_index[s] != invalid_node) | |
2196 | { | |
2197 | last_accepting_index = s; | |
2198 | last_accepting_cpos = wp; | |
2199 | } | |
2200 | ||
2201 | } | |
2202 | ||
2203 | break_while: | |
2204 | if (last_accepting_index != invalid_node) | |
2205 | { | |
2206 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2207 | std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << | |
2208 | dfa.acceptance_index[last_accepting_index] << '\n'; | |
2209 | #endif | |
2210 | first = last_accepting_cpos; | |
2211 | regex_index = dfa.acceptance_index[last_accepting_index]; | |
2212 | ||
2213 | return true; | |
2214 | } | |
2215 | else | |
2216 | return false; | |
2217 | } | |
2218 | }; | |
2219 | ||
2220 | template <typename DfaT, typename IteratorT, bool wide_char> | |
2221 | struct regex_match | |
2222 | { | |
2223 | static bool | |
2224 | do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, | |
2225 | int& regex_index, | |
2226 | std::basic_string< | |
2227 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2228 | > *token) | |
2229 | { | |
2230 | return regex_match_helper<wide_char>::do_match( | |
2231 | dfa, first, last, regex_index, token); | |
2232 | } | |
2233 | }; | |
2234 | ||
2235 | } // namespace lexerimpl | |
2236 | ||
2237 | /////////////////////////////////////////////////////////////////////////////// | |
2238 | // | |
2239 | template <typename IteratorT = char const*, typename TokenT = int, | |
2240 | typename CallbackT = void(*)(IteratorT const &, | |
2241 | IteratorT &, | |
2242 | IteratorT const&, | |
2243 | TokenT const&, | |
2244 | lexer_control<TokenT> &)> | |
2245 | class lexer | |
2246 | { | |
2247 | public: | |
2248 | typedef CallbackT callback_t; | |
2249 | typedef | |
2250 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2251 | char_t; | |
2252 | ||
2253 | struct regex_info | |
2254 | { | |
2255 | std::basic_string<char_t> str; | |
2256 | TokenT token; | |
2257 | CallbackT callback; | |
2258 | ||
2259 | regex_info(const std::basic_string<char_t>& _str, | |
2260 | const TokenT& _token, | |
2261 | const CallbackT& _callback) | |
2262 | : str(_str) | |
2263 | , token(_token) | |
2264 | , callback(_callback) | |
2265 | {} | |
2266 | ||
2267 | }; | |
2268 | ||
2269 | struct dfa_table | |
2270 | { | |
2271 | std::vector<std::vector<node_id_t> > transition_table; | |
2272 | std::vector<node_id_t> acceptance_index; | |
2273 | }; | |
2274 | typedef std::vector<node_id_t> node_table_t; | |
2275 | typedef std::vector<node_table_t> transition_table_t; | |
2276 | typedef std::vector<dfa_table> dfa_t; | |
2277 | ||
2278 | ||
2279 | lexer(unsigned int states = 1); | |
2280 | ||
2281 | void register_regex(const std::basic_string<char_t>& regex, | |
2282 | const TokenT& id, const CallbackT& cb = CallbackT(), | |
2283 | unsigned int state = 0); | |
2284 | ||
2285 | TokenT next_token(IteratorT &first, IteratorT const &last, | |
2286 | std::basic_string<char_t> *token = 0); | |
2287 | ||
2288 | void create_dfa(); | |
2289 | bool has_compiled_dfa() { return m_compiled_dfa; } | |
2290 | ||
2291 | void set_case_insensitive(bool insensitive); | |
2292 | ||
2293 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2294 | void dump(std::ostream& out); | |
2295 | #endif | |
2296 | typedef std::vector<std::vector<regex_info> > regex_list_t; | |
2297 | ||
2298 | bool load (std::ifstream &in, long unique_id = 0); | |
2299 | bool save (std::ofstream &out, long unique_id = 0); | |
2300 | enum { | |
2301 | SLEX_SIGNATURE = 0x58454C53, // "SLEX" | |
2302 | SLEX_VERSION_100 = 0x0100, // persistance version | |
2303 | SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100, | |
2304 | SLEX_MINOR_VERSION_MASK = 0xFF | |
2305 | }; | |
2306 | ||
2307 | private: | |
2308 | ||
2309 | void create_dfa_for_state(int state); | |
2310 | ||
2311 | static bool regex_match(const dfa_t& dfa, IteratorT& first, | |
2312 | IteratorT& last, int& regex_index); | |
2313 | ||
2314 | mutable std::stack<lexerimpl::node*> node_stack; | |
2315 | lexerimpl::lexer_grammar g; | |
2316 | ||
2317 | mutable bool m_compiled_dfa; | |
2318 | mutable dfa_t m_dfa; | |
2319 | ||
2320 | regex_list_t m_regex_list; | |
2321 | bool m_case_insensitive; | |
2322 | ||
2323 | unsigned int m_state; | |
2324 | std::stack<unsigned int> m_state_stack; | |
2325 | unsigned int m_num_states; | |
2326 | }; | |
2327 | ||
2328 | ||
2329 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2330 | inline | |
2331 | lexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states) | |
2332 | : g(node_stack) | |
2333 | , m_compiled_dfa(false) | |
2334 | , m_regex_list(states) | |
2335 | , m_case_insensitive(false) | |
2336 | , m_state(0) | |
2337 | , m_state_stack() | |
2338 | , m_num_states(states) | |
2339 | { | |
2340 | BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer", | |
2341 | BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX); | |
2342 | } | |
2343 | ||
2344 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2345 | inline void | |
2346 | lexer<IteratorT, TokenT, CallbackT>::register_regex( | |
2347 | const std::basic_string<char_t>& regex, const TokenT& id, | |
2348 | const CallbackT& callback, unsigned int state) | |
2349 | { | |
2350 | if (state > m_num_states) { | |
2351 | m_regex_list.resize(state); | |
2352 | m_num_states = state; | |
2353 | } | |
2354 | m_regex_list[state].push_back(regex_info(regex, id, callback)); | |
2355 | } | |
2356 | ||
2357 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2358 | inline TokenT | |
2359 | lexer<IteratorT, TokenT, CallbackT>::next_token( | |
2360 | IteratorT &first, IteratorT const& last, | |
2361 | std::basic_string< | |
2362 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2363 | > *token) | |
2364 | { | |
2365 | if (!m_compiled_dfa) | |
2366 | { | |
2367 | create_dfa(); | |
2368 | } | |
2369 | ||
2370 | IteratorT saved = first; | |
2371 | int regex_index; | |
2372 | if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>:: | |
2373 | do_match(m_dfa[m_state], first, last, regex_index, token)) | |
2374 | return -1; // TODO: can't return -1, need to return some invalid token. | |
2375 | // how to figure this out? We can use traits I guess. | |
2376 | else | |
2377 | { | |
2378 | regex_info regex = m_regex_list[m_state][regex_index]; | |
2379 | TokenT rval = regex.token; | |
2380 | if (regex.callback) | |
2381 | { | |
2382 | // execute corresponding callback | |
2383 | lexer_control<TokenT> controller(rval, m_state, m_state_stack); | |
2384 | regex.callback(saved, first, last, regex.token, controller); | |
2385 | if (controller.ignore_current_token_set()) { | |
2386 | if (token) | |
2387 | token->erase(); | |
2388 | return next_token(first, last, token); | |
2389 | } | |
2390 | } | |
2391 | return rval; | |
2392 | } | |
2393 | } | |
2394 | ||
2395 | namespace lexerimpl | |
2396 | { | |
2397 | ||
2398 | inline | |
2399 | bool find_acceptance_state(const node_set& eof_node_ids, | |
2400 | const node_set& current_set, | |
2401 | node_id_t& acceptance_node_id) | |
2402 | { | |
2403 | for(node_set::const_iterator nsi = eof_node_ids.begin(); | |
2404 | nsi != eof_node_ids.end(); ++nsi) | |
2405 | { | |
2406 | node_id_t eof_node_id =*nsi; | |
2407 | if (current_set.end() != current_set.find(eof_node_id)) | |
2408 | { | |
2409 | // store the first one we come to as the | |
2410 | // matched pattern | |
2411 | acceptance_node_id = eof_node_id; | |
2412 | // don't bother searching for more | |
2413 | return true; | |
2414 | } | |
2415 | } | |
2416 | return false; | |
2417 | } | |
2418 | ||
2419 | template <typename RegexListT, typename GrammarT> | |
b32b8144 FG |
2420 | #ifdef BOOST_NO_AUTO_PTR |
2421 | inline std::unique_ptr<node> | |
2422 | #else | |
7c673cae | 2423 | inline std::auto_ptr<node> |
b32b8144 | 2424 | #endif |
7c673cae FG |
2425 | parse_regexes(const RegexListT& regex_list, GrammarT& g) |
2426 | { | |
2427 | // parse the expressions into a tree | |
2428 | if (regex_list.begin() == regex_list.end()) | |
2429 | boost::throw_exception(bad_regex()); | |
2430 | ||
2431 | typename RegexListT::const_iterator ri = regex_list.begin(); | |
b32b8144 FG |
2432 | #ifdef BOOST_NO_AUTO_PTR |
2433 | std::unique_ptr<node> tree(lexerimpl::parse(g, (*ri).str)); | |
2434 | #else | |
7c673cae | 2435 | std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str)); |
b32b8144 | 2436 | #endif |
7c673cae FG |
2437 | if (tree.get() == 0) |
2438 | boost::throw_exception(bad_regex()); | |
2439 | ||
2440 | ++ri; | |
2441 | for (/**/; ri != regex_list.end(); ++ri) | |
2442 | { | |
b32b8144 FG |
2443 | #ifdef BOOST_NO_AUTO_PTR |
2444 | std::unique_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str)); | |
2445 | #else | |
7c673cae | 2446 | std::auto_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str)); |
b32b8144 | 2447 | #endif |
7c673cae FG |
2448 | if (next_tree.get() == 0) |
2449 | boost::throw_exception(bad_regex()); | |
b32b8144 FG |
2450 | #ifdef BOOST_NO_AUTO_PTR |
2451 | tree = std::unique_ptr<node>(new or_node(tree.release(), next_tree.release())); | |
2452 | #else | |
2453 | tree = std::auto_ptr<node>(new or_node(tree.release(), next_tree.release())); | |
2454 | #endif | |
7c673cae FG |
2455 | } |
2456 | return tree; | |
2457 | } | |
2458 | ||
2459 | } //namespace lexerimpl | |
2460 | ||
2461 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2462 | inline void | |
2463 | lexer<IteratorT, TokenT, CallbackT>::create_dfa() | |
2464 | { | |
2465 | m_dfa.resize(m_num_states); | |
2466 | for (unsigned int i = 0; i < m_num_states; ++i) | |
2467 | create_dfa_for_state(i); | |
2468 | } | |
2469 | ||
2470 | // Algorithm from Compilers: Principles, Techniques, and Tools p. 141 | |
2471 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2472 | inline void | |
2473 | lexer<IteratorT, TokenT, CallbackT>::create_dfa_for_state(int state) | |
2474 | { | |
2475 | using lexerimpl::node; | |
b32b8144 FG |
2476 | #ifdef BOOST_NO_AUTO_PTR |
2477 | std::unique_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g); | |
2478 | #else | |
7c673cae | 2479 | std::auto_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g); |
b32b8144 | 2480 | #endif |
7c673cae FG |
2481 | node_id_t dummy = 0; |
2482 | tree->assign_node_ids(dummy); | |
2483 | ||
2484 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2485 | tree->dump(std::cout); | |
2486 | #endif | |
2487 | ||
2488 | // compute followpos(root) | |
2489 | followpos_t followpos; | |
2490 | tree->compute_followpos(followpos); | |
2491 | ||
2492 | // the dfa states <-> nfa state groups | |
2493 | std::map<node_set, node_id_t> dstates1; | |
2494 | std::map<node_id_t, node_set> dstates2; | |
2495 | ||
2496 | // the dfa transitions | |
2497 | m_dfa[state].transition_table.push_back( | |
2498 | std::vector<node_id_t>(256, invalid_node)); | |
2499 | m_dfa[state].acceptance_index.push_back(invalid_node); | |
2500 | ||
2501 | // whether the dfa state has been processed yet | |
2502 | std::vector<node_id_t> marked; | |
2503 | ||
2504 | // used to give a unique id to each dfa state | |
2505 | node_id_t num_states = 0; | |
2506 | ||
2507 | // initially, the only unmarked state in Dstates is firstpos(root). | |
2508 | marked.push_back(0); | |
2509 | node_set fpr = tree->firstpos(); | |
2510 | dstates1[fpr] = 0; | |
2511 | dstates2[0] = fpr; | |
2512 | state_match_t state_match; | |
2513 | tree->compute_state_match(state_match); | |
2514 | ||
2515 | if (m_case_insensitive) | |
2516 | lexerimpl::make_case_insensitive(state_match); | |
2517 | ||
2518 | node_set eof_node_ids; | |
2519 | tree->get_eof_ids(eof_node_ids); | |
2520 | // translate the eof_node_ids into a 0-based index | |
2521 | std::map<node_id_t, node_id_t> eof_node_id_map; | |
2522 | unsigned int x = 0; | |
2523 | for (node_set::iterator node_id_it = eof_node_ids.begin(); | |
2524 | node_id_it != eof_node_ids.end(); | |
2525 | ++node_id_it) | |
2526 | { | |
2527 | eof_node_id_map[*node_id_it] = x++; | |
2528 | } | |
2529 | ||
2530 | // figure out if this is an acceptance state | |
2531 | node_id_t eof_node_id; | |
2532 | if (lexerimpl::find_acceptance_state(eof_node_ids, fpr, eof_node_id)) | |
2533 | { | |
2534 | m_dfa[state].acceptance_index[0] = eof_node_id_map[eof_node_id]; | |
2535 | } | |
2536 | ||
2537 | std::vector<node_id_t>::iterator i = std::find(marked.begin(), marked.end(), | |
2538 | node_id_t(0)); | |
2539 | while (marked.end() != i) | |
2540 | { | |
2541 | *i = 1; | |
2542 | node_id_t T = node_id_t(std::distance(marked.begin(), i)); | |
2543 | BOOST_ASSERT(T < dstates2.size()); | |
2544 | node_set Tstates = dstates2[T]; | |
2545 | for (node_id_t j = 0; j < 256; ++j) | |
2546 | { | |
2547 | node_set U; | |
2548 | for (node_set::iterator k = Tstates.begin(); | |
2549 | k != Tstates.end(); ++k) | |
2550 | { | |
2551 | node_id_t p =*k; | |
2552 | BOOST_ASSERT(p < state_match.size()); | |
2553 | BOOST_ASSERT(j < state_match[p].size()); | |
2554 | if (state_match[p][j]) | |
2555 | { | |
2556 | node_set fpp = followpos[p]; | |
2557 | U.insert(fpp.begin(), fpp.end()); | |
2558 | } | |
2559 | } | |
2560 | if (U.size() > 0) | |
2561 | { | |
2562 | std::map<node_set, node_id_t>::iterator l = dstates1.find(U); | |
2563 | node_id_t target_state; | |
2564 | if (l == dstates1.end()) // not in the states yet | |
2565 | { | |
2566 | ++num_states; | |
2567 | dstates1[U] = target_state = num_states; | |
2568 | dstates2[target_state] = U; | |
2569 | marked.push_back(0); | |
2570 | m_dfa[state].transition_table.push_back( | |
2571 | std::vector<node_id_t>(256, invalid_node)); | |
2572 | m_dfa[state].acceptance_index.push_back(invalid_node); | |
2573 | // figure out if this is an acceptance state | |
2574 | node_id_t eof_node_id; | |
2575 | if (lexerimpl::find_acceptance_state(eof_node_ids, U, eof_node_id)) | |
2576 | { | |
2577 | m_dfa[state].acceptance_index[target_state] = | |
2578 | eof_node_id_map[eof_node_id]; | |
2579 | } | |
2580 | } | |
2581 | else | |
2582 | { | |
2583 | target_state = dstates1[U]; | |
2584 | } | |
2585 | ||
2586 | BOOST_ASSERT(T < m_dfa[state].transition_table.size()); | |
2587 | BOOST_ASSERT(j < m_dfa[state].transition_table[T].size()); | |
2588 | m_dfa[state].transition_table[T][j] = target_state; | |
2589 | } | |
2590 | ||
2591 | } | |
2592 | ||
2593 | i = std::find(marked.begin(), marked.end(), node_id_t(0)); | |
2594 | } | |
2595 | m_compiled_dfa = true; | |
2596 | } | |
2597 | ||
2598 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2599 | inline void | |
2600 | lexer<IteratorT, TokenT, CallbackT>::set_case_insensitive(bool insensitive) | |
2601 | { | |
2602 | m_case_insensitive = insensitive; | |
2603 | } | |
2604 | ||
2605 | ||
2606 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2607 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2608 | inline void | |
2609 | lexer<IteratorT, TokenT, CallbackT>::dump(std::ostream& out) | |
2610 | { | |
2611 | for (unsigned x = 0; x < m_dfa.size(); ++x) | |
2612 | { | |
2613 | out << "\nm_dfa[" << x << "] has " << m_dfa[x].transition_table.size() << " states\n"; | |
2614 | for (node_id_t i = 0; i < m_dfa[x].transition_table.size(); ++i) | |
2615 | { | |
2616 | out << "state " << i << ":"; | |
2617 | for (node_id_t j = 0; j < m_dfa[x].transition_table[i].size(); ++j) | |
2618 | { | |
2619 | if (m_dfa[x].transition_table[i][j] != invalid_node) | |
2620 | out << j << "->" << m_dfa[x].transition_table[i][j] << " "; | |
2621 | } | |
2622 | out << "\n"; | |
2623 | } | |
2624 | out << "acceptance states: "; | |
2625 | for(unsigned int k = 0; k < m_dfa[x].acceptance_index.size(); ++k) | |
2626 | { | |
2627 | if (m_dfa[x].acceptance_index[k] != invalid_node) | |
2628 | out << '<' << k << ',' << m_dfa[x].acceptance_index[k] << "> "; | |
2629 | } | |
2630 | out << endl; | |
2631 | } | |
2632 | } | |
2633 | #endif | |
2634 | ||
2635 | /////////////////////////////////////////////////////////////////////////////// | |
2636 | // load the lexer tables | |
2637 | #define slex_in(strm, val) \ | |
2638 | strm.read((char*)&val, sizeof(val)); \ | |
2639 | if (std::ios::goodbit != strm.rdstate()) return false | |
2640 | ||
2641 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2642 | inline bool | |
2643 | lexer<IteratorT, TokenT, CallbackT>::load (std::ifstream &in, long unique_id) | |
2644 | { | |
2645 | // ensure correct signature and version | |
2646 | long signature = 0; | |
2647 | ||
2648 | slex_in (in, signature); | |
2649 | if (signature != SLEX_SIGNATURE) | |
2650 | return false; // not for us | |
2651 | ||
2652 | long version = 0; | |
2653 | ||
2654 | slex_in (in, version); | |
2655 | if ((version & ~SLEX_MINOR_VERSION_MASK) > SLEX_LAST_KNOWN_VERSION) | |
2656 | return false; // to new for us | |
2657 | ||
2658 | long uid = 0; | |
2659 | ||
2660 | slex_in (in, uid); | |
2661 | if (uid != unique_id) | |
2662 | return false; // not saved by us | |
2663 | ||
2664 | // load auxiliary members | |
2665 | int num_states = 0; | |
2666 | ||
2667 | slex_in (in, num_states); | |
2668 | ||
2669 | // load the dfa tables | |
2670 | dfa_t in_dfa; | |
2671 | std::size_t dfa_size = 0; | |
2672 | ||
2673 | slex_in (in, dfa_size); | |
2674 | in_dfa.resize(dfa_size); | |
2675 | for (std::size_t dfa = 0; dfa < dfa_size; ++dfa) | |
2676 | { | |
2677 | // load the transition tables | |
2678 | std::size_t tt_size = 0; | |
2679 | transition_table_t &tt_table = in_dfa[dfa].transition_table; | |
2680 | ||
2681 | slex_in (in, tt_size); | |
2682 | tt_table.resize(tt_size); | |
2683 | for (std::size_t tt = 0; tt < tt_size; ++tt) | |
2684 | { | |
2685 | std::size_t nt_size = 0; | |
2686 | node_table_t &nt_table = tt_table[tt]; | |
2687 | ||
2688 | slex_in (in, nt_size); | |
2689 | nt_table.resize(nt_size); | |
2690 | for (std::size_t nt = 0; nt < nt_size; ++nt) | |
2691 | { | |
2692 | slex_in (in, nt_table[nt]); | |
2693 | } | |
2694 | } | |
2695 | ||
2696 | // load the acceptance index table | |
2697 | std::size_t ai_size = 0; | |
2698 | node_table_t &ai_table = in_dfa[dfa].acceptance_index; | |
2699 | ||
2700 | slex_in (in, ai_size); | |
2701 | ai_table.resize(ai_size); | |
2702 | for (std::size_t ai = 0; ai < ai_size; ++ai) | |
2703 | { | |
2704 | slex_in (in, ai_table[ai]); | |
2705 | } | |
2706 | } | |
2707 | ||
2708 | m_dfa.swap(in_dfa); // success, swap in the read values | |
2709 | m_num_states = num_states; | |
2710 | ||
2711 | m_compiled_dfa = true; | |
2712 | return true; | |
2713 | } | |
2714 | ||
2715 | #undef slex_in | |
2716 | ||
2717 | /////////////////////////////////////////////////////////////////////////////// | |
2718 | // save the lexer tables | |
2719 | #define slex_out(strm, val) \ | |
2720 | strm.write((char*)&val, sizeof(val)); \ | |
2721 | if (std::ios::goodbit != strm.rdstate()) return false | |
2722 | ||
2723 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2724 | inline bool | |
2725 | lexer<IteratorT, TokenT, CallbackT>::save (std::ofstream &out, long unique_id) | |
2726 | { | |
2727 | // save signature and version information | |
2728 | long out_long = SLEX_SIGNATURE; | |
2729 | ||
2730 | slex_out(out, out_long); | |
2731 | out_long = SLEX_VERSION_100; | |
2732 | slex_out(out, out_long); | |
2733 | slex_out(out, unique_id); | |
2734 | ||
2735 | // save auxiliary members | |
2736 | slex_out(out, m_num_states); | |
2737 | ||
2738 | // save the dfa tables | |
2739 | typedef typename dfa_t::const_iterator dfa_iter_t; | |
2740 | typedef transition_table_t::const_iterator transition_table_iter_t; | |
2741 | typedef node_table_t::const_iterator node_table_iter_t; | |
2742 | ||
2743 | std::size_t out_size_t = m_dfa.size(); | |
2744 | slex_out(out, out_size_t); | |
2745 | ||
2746 | dfa_iter_t end = m_dfa.end(); | |
2747 | for (dfa_iter_t it = m_dfa.begin(); it != end; ++it) | |
2748 | { | |
2749 | // save the transition table | |
2750 | out_size_t = (*it).transition_table.size(); | |
2751 | slex_out(out, out_size_t); | |
2752 | ||
2753 | transition_table_iter_t tt_end = (*it).transition_table.end(); | |
2754 | for (transition_table_iter_t tt_it = (*it).transition_table.begin(); | |
2755 | tt_it != tt_end; | |
2756 | ++tt_it) | |
2757 | { | |
2758 | out_size_t = (*tt_it).size(); | |
2759 | slex_out(out, out_size_t); | |
2760 | ||
2761 | node_table_iter_t nt_end = (*tt_it).end(); | |
2762 | for (node_table_iter_t nt_it = (*tt_it).begin(); | |
2763 | nt_it != nt_end; | |
2764 | ++nt_it) | |
2765 | { | |
2766 | slex_out(out, (*nt_it)); | |
2767 | } | |
2768 | } | |
2769 | ||
2770 | // save the acceptance index table | |
2771 | out_size_t = (*it).acceptance_index.size(); | |
2772 | slex_out(out, out_size_t); | |
2773 | ||
2774 | node_table_iter_t nt_end = (*it).acceptance_index.end(); | |
2775 | for (node_table_iter_t nt_it = (*it).acceptance_index.begin(); | |
2776 | nt_it != nt_end; | |
2777 | ++nt_it) | |
2778 | { | |
2779 | slex_out(out, (*nt_it)); | |
2780 | } | |
2781 | } | |
2782 | return true; | |
2783 | } | |
2784 | ||
2785 | #undef slex_out | |
2786 | ||
2787 | /* | |
2788 | a lexer_control object supports some operations on the lexer. | |
2789 | get current lexer state | |
2790 | set state | |
2791 | terminate | |
2792 | state stack (push, pop, top) | |
2793 | set new token return value | |
2794 | ignore the current token | |
2795 | yymore | |
2796 | get length of matched token | |
2797 | */ | |
2798 | template <typename TokenT> | |
2799 | class lexer_control | |
2800 | { | |
2801 | public: | |
2802 | ||
2803 | lexer_control(TokenT& token, unsigned int& current_state, | |
2804 | std::stack<unsigned int>& state_stack); | |
2805 | // functions dealing with the lexer state | |
2806 | ||
2807 | // set the state to state | |
2808 | void set_state(unsigned int state); | |
2809 | ||
2810 | // get the current state | |
2811 | unsigned int get_state(); | |
2812 | ||
2813 | // pushes the current state onto the top of the state stack and | |
2814 | // switches to new_state | |
2815 | void push_state(unsigned int new_state); | |
2816 | ||
2817 | // pops the top of the state stack and switches to it. | |
2818 | void pop_state(); | |
2819 | ||
2820 | // returns the top of the stack without altering the stack's contents. | |
2821 | unsigned int top_state(); | |
2822 | ||
2823 | // functions dealing with the token returned. | |
2824 | ||
2825 | // set a new TokenT return value, overriding that one that was | |
2826 | // registered via. register_regex() | |
2827 | void set_token(const TokenT& token); | |
2828 | ||
2829 | // tell the lexer to return an invalid token, signifying termination. | |
2830 | void terminate(); | |
2831 | ||
2832 | // ignore the current token, and move on to the next one. The current | |
2833 | // token will NOT be returned from next_token() | |
2834 | void ignore_current_token(); | |
2835 | ||
2836 | // returns true if ignore_current_token() has been called, | |
2837 | // false otherwise. | |
2838 | bool ignore_current_token_set(); | |
2839 | ||
2840 | private: | |
2841 | TokenT& m_token; | |
2842 | bool m_ignore_current_token; | |
2843 | unsigned int& m_current_state; | |
2844 | std::stack<unsigned int>& m_state_stack; | |
2845 | }; | |
2846 | ||
2847 | template <typename TokenT> | |
2848 | inline | |
2849 | lexer_control<TokenT>::lexer_control(TokenT& token, unsigned int& current_state, | |
2850 | std::stack<unsigned int>& state_stack) | |
2851 | : m_token(token) | |
2852 | , m_ignore_current_token(false) | |
2853 | , m_current_state(current_state) | |
2854 | , m_state_stack(state_stack) | |
2855 | { | |
2856 | } | |
2857 | ||
2858 | template <typename TokenT> | |
2859 | inline void | |
2860 | lexer_control<TokenT>::set_state(unsigned int state) | |
2861 | { | |
2862 | m_current_state = state; | |
2863 | } | |
2864 | ||
2865 | template <typename TokenT> | |
2866 | inline unsigned int | |
2867 | lexer_control<TokenT>::get_state() | |
2868 | { | |
2869 | return m_current_state; | |
2870 | } | |
2871 | ||
2872 | template <typename TokenT> | |
2873 | inline void | |
2874 | lexer_control<TokenT>::push_state(unsigned int new_state) | |
2875 | { | |
2876 | m_state_stack.push(m_current_state); | |
2877 | m_current_state = new_state; | |
2878 | } | |
2879 | ||
2880 | template <typename TokenT> | |
2881 | inline void | |
2882 | lexer_control<TokenT>::pop_state() | |
2883 | { | |
2884 | m_current_state = m_state_stack.top(); | |
2885 | m_state_stack.pop(); | |
2886 | } | |
2887 | ||
2888 | template <typename TokenT> | |
2889 | inline unsigned int | |
2890 | lexer_control<TokenT>::top_state() | |
2891 | { | |
2892 | return m_state_stack.top(); | |
2893 | } | |
2894 | ||
2895 | template <typename TokenT> | |
2896 | inline void | |
2897 | lexer_control<TokenT>::set_token(const TokenT& token) | |
2898 | { | |
2899 | m_token = token; | |
2900 | } | |
2901 | ||
2902 | template <typename TokenT> | |
2903 | inline void | |
2904 | lexer_control<TokenT>::terminate() | |
2905 | { | |
2906 | m_token = -1; // TOOD: fix this, need to be determined by traits | |
2907 | } | |
2908 | ||
2909 | template <typename TokenT> | |
2910 | inline void | |
2911 | lexer_control<TokenT>::ignore_current_token() | |
2912 | { | |
2913 | m_ignore_current_token = true; | |
2914 | } | |
2915 | ||
2916 | template <typename TokenT> | |
2917 | inline bool | |
2918 | lexer_control<TokenT>::ignore_current_token_set() | |
2919 | { | |
2920 | return m_ignore_current_token; | |
2921 | } | |
2922 | ||
2923 | } // namespace classic | |
2924 | } // namespace spirit | |
2925 | } // namespace boost | |
2926 | ||
2927 | #undef BOOST_SPIRIT_IT_NS | |
2928 | ||
2929 | #endif | |
2930 |