]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /*============================================================================= |
2 | Boost.Wave: A Standard compliant C++ preprocessor library | |
3 | ||
4 | Spirit based lexer | |
5 | ||
6 | http://www.boost.org/ | |
7 | ||
8 | Copyright (c) 2001, Daniel C. Nuffer. | |
9 | Copyright (c) 2001-2012 Hartmut Kaiser. | |
10 | Distributed under the Boost Software License, Version 1.0. (See accompanying | |
11 | file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
12 | ||
13 | TODO List: | |
14 | X callback objects (called when a match is made.) | |
15 | X callback passed first & last iterator, and | |
16 | a reference to a lexercontrol object that supports some | |
17 | operations on the lexer. | |
18 | set state | |
19 | terminate | |
20 | state stack (push, pop, top) | |
21 | set new token return value | |
22 | ignore the current token | |
23 | yymore | |
24 | get length of matched token | |
25 | get current lexer state | |
26 | X DFA serialization to save recomputing the DFA. | |
27 | ||
28 | lexer states. | |
29 | organize the file into hpp and ipp. arrange the classes in a logical order. | |
30 | use buffering - iterator only needs be an input iterator, | |
31 | lexer & callback are not templatized on iterator type, but char type. | |
32 | next_token is templatized on iterator type. | |
33 | beginning/ending contexts. | |
34 | ^ and $ | |
35 | DFA minimization. | |
36 | DFA table compression. | |
37 | ||
38 | =============================================================================*/ | |
39 | #ifndef BOOST_SPIRIT_LEXER_HPP | |
40 | #define BOOST_SPIRIT_LEXER_HPP | |
41 | ||
42 | /////////////////////////////////////////////////////////////////////////////// | |
43 | #include <boost/throw_exception.hpp> | |
44 | ||
45 | #include <boost/spirit/include/classic_core.hpp> | |
46 | #include <boost/spirit/include/classic_symbols.hpp> | |
47 | #include <boost/spirit/include/classic_chset.hpp> | |
48 | #include <boost/spirit/include/classic_escape_char.hpp> | |
49 | ||
50 | #include <set> | |
51 | #include <map> | |
52 | #include <vector> | |
53 | #include <stack> | |
54 | #include <utility> // for pair | |
55 | #include <iostream> | |
56 | #include <fstream> | |
57 | #include <boost/assert.hpp> | |
58 | #include <boost/limits.hpp> | |
59 | ||
60 | #if defined(BOOST_NO_STD_ITERATOR_TRAITS) | |
61 | #define BOOST_SPIRIT_IT_NS impl | |
62 | #else | |
63 | #define BOOST_SPIRIT_IT_NS std | |
64 | #endif | |
65 | ||
66 | /////////////////////////////////////////////////////////////////////////////// | |
67 | namespace boost { | |
68 | namespace spirit { | |
69 | namespace classic { | |
70 | ||
71 | typedef unsigned char uchar; | |
72 | typedef unsigned int node_id_t; | |
73 | const node_id_t invalid_node = node_id_t(-1); | |
74 | typedef std::set<node_id_t> node_set; | |
75 | typedef std::vector<uchar> uchar_vector; | |
76 | typedef std::map<node_id_t, node_set> followpos_t; | |
77 | typedef std::vector<uchar_vector> state_match_t; | |
78 | ||
79 | template <typename TokenT> | |
80 | class lexer_control; | |
81 | ||
82 | class bad_regex : public std::exception | |
83 | { | |
84 | }; | |
85 | ||
86 | namespace lexerimpl | |
87 | { | |
88 | ||
89 | class node | |
90 | { | |
91 | public: | |
92 | ||
93 | virtual ~node() {} | |
94 | ||
95 | virtual node* clone() const = 0; | |
96 | virtual bool nullable() const = 0; | |
97 | virtual node_set firstpos() const = 0; | |
98 | virtual node_set lastpos() const = 0; | |
99 | virtual void compute_followpos(followpos_t& followpos) const = 0; | |
100 | virtual void compute_state_match(state_match_t& state_match) const = 0; | |
101 | virtual void get_eof_ids(node_set& eof_set) const = 0; | |
102 | virtual void assign_node_ids(node_id_t& node_count) = 0; | |
103 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
104 | virtual void dump(std::ostream& out) const = 0; | |
105 | #endif | |
106 | ||
107 | }; | |
108 | ||
109 | class char_node : public node | |
110 | { | |
111 | ||
112 | public: | |
113 | ||
114 | char_node(const uchar c); | |
115 | char_node(const char_node& x); | |
116 | virtual ~char_node(){} | |
117 | ||
118 | virtual node* clone() const; | |
119 | virtual bool nullable() const; | |
120 | virtual node_set firstpos() const; | |
121 | virtual node_set lastpos() const; | |
122 | virtual void compute_followpos(followpos_t& followpos) const; | |
123 | virtual void compute_state_match(state_match_t& state_match ) const; | |
124 | virtual void get_eof_ids(node_set& eof_set) const; | |
125 | virtual void assign_node_ids(node_id_t& node_count); | |
126 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
127 | virtual void dump(std::ostream& out) const; | |
128 | #endif | |
129 | ||
130 | private: | |
131 | ||
132 | uchar m_char; | |
133 | node_id_t m_node_num; | |
134 | }; | |
135 | ||
136 | inline | |
137 | char_node::char_node(const uchar c) | |
138 | : node() | |
139 | , m_char(c) | |
140 | , m_node_num(0) | |
141 | { | |
142 | } | |
143 | ||
144 | inline | |
145 | char_node::char_node(const char_node& x) | |
146 | : node(x) | |
147 | , m_char(x.m_char) | |
148 | , m_node_num(x.m_node_num) | |
149 | { | |
150 | } | |
151 | ||
152 | inline node * | |
153 | char_node::clone() const | |
154 | { | |
155 | return new char_node(*this); | |
156 | } | |
157 | ||
158 | inline bool | |
159 | char_node::nullable() const | |
160 | { | |
161 | return false; | |
162 | } | |
163 | ||
164 | inline node_set | |
165 | char_node::firstpos() const | |
166 | { | |
167 | node_set rval; | |
168 | rval.insert(m_node_num); | |
169 | return rval; | |
170 | } | |
171 | ||
172 | inline node_set | |
173 | char_node::lastpos() const | |
174 | { | |
175 | return firstpos(); | |
176 | } | |
177 | ||
178 | inline void | |
179 | char_node::compute_followpos(followpos_t&) const | |
180 | { | |
181 | return; | |
182 | } | |
183 | ||
184 | inline void | |
185 | char_node::compute_state_match(state_match_t& state_match) const | |
186 | { | |
187 | if (state_match.size() < m_node_num + 1) | |
188 | state_match.resize(m_node_num + 1); | |
189 | state_match[m_node_num].resize(256); | |
190 | state_match[m_node_num][m_char] = 1; | |
191 | } | |
192 | ||
193 | inline void | |
194 | char_node::get_eof_ids(node_set&) const | |
195 | { | |
196 | return; | |
197 | } | |
198 | ||
199 | inline void | |
200 | char_node::assign_node_ids(node_id_t& node_count) | |
201 | { | |
202 | m_node_num = node_count++; | |
203 | } | |
204 | ||
205 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
206 | inline void | |
207 | char_node::dump(std::ostream& out) const | |
208 | { | |
209 | out << "\nchar_node m_char = " << m_char; | |
210 | out << " m_node_num = " << m_node_num; | |
211 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
212 | out << " firstpos() = "; | |
213 | node_set fp = firstpos(); | |
214 | std::copy(fp.begin(), fp.end(), | |
215 | std::ostream_iterator<node_id_t>(out, ",")); | |
216 | ||
217 | out << " lastpos() = "; | |
218 | node_set lp = lastpos(); | |
219 | std::copy(lp.begin(), lp.end(), | |
220 | std::ostream_iterator<node_id_t>(out, ",")); | |
221 | } | |
222 | #endif | |
223 | ||
224 | ||
225 | class epsilon_node : public node | |
226 | { | |
227 | ||
228 | public: | |
229 | ||
230 | epsilon_node(); | |
231 | epsilon_node(const epsilon_node& x); | |
232 | virtual ~epsilon_node(){} | |
233 | ||
234 | virtual node* clone() const; | |
235 | virtual bool nullable() const; | |
236 | virtual node_set firstpos() const; | |
237 | virtual node_set lastpos() const; | |
238 | virtual void compute_followpos(followpos_t& followpos) const; | |
239 | virtual void compute_state_match(state_match_t& state_match ) const; | |
240 | virtual void get_eof_ids(node_set& eof_set) const; | |
241 | virtual void assign_node_ids(node_id_t& node_count); | |
242 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
243 | virtual void dump(std::ostream& out) const; | |
244 | #endif | |
245 | ||
246 | private: | |
247 | ||
248 | node_id_t m_node_num; | |
249 | }; | |
250 | ||
251 | inline | |
252 | epsilon_node::epsilon_node() | |
253 | : node() | |
254 | , m_node_num(0) | |
255 | { | |
256 | } | |
257 | ||
258 | inline | |
259 | epsilon_node::epsilon_node(const epsilon_node& x) | |
260 | : node(x) | |
261 | , m_node_num(x.m_node_num) | |
262 | { | |
263 | } | |
264 | ||
265 | inline node * | |
266 | epsilon_node::clone() const | |
267 | { | |
268 | return new epsilon_node(*this); | |
269 | } | |
270 | ||
271 | inline bool | |
272 | epsilon_node::nullable() const | |
273 | { | |
274 | return true; | |
275 | } | |
276 | ||
277 | inline node_set | |
278 | epsilon_node::firstpos() const | |
279 | { | |
280 | return node_set(); | |
281 | } | |
282 | ||
283 | inline node_set | |
284 | epsilon_node::lastpos() const | |
285 | { | |
286 | return node_set(); | |
287 | } | |
288 | ||
289 | inline void | |
290 | epsilon_node::compute_followpos(followpos_t&) const | |
291 | { | |
292 | return; | |
293 | } | |
294 | ||
295 | inline void | |
296 | epsilon_node::compute_state_match(state_match_t& state_match) const | |
297 | { | |
298 | if (state_match.size() < m_node_num + 1) | |
299 | state_match.resize(m_node_num + 1); | |
300 | state_match[m_node_num].resize(256, 1); | |
301 | } | |
302 | ||
303 | inline void | |
304 | epsilon_node::get_eof_ids(node_set&) const | |
305 | { | |
306 | return; | |
307 | } | |
308 | ||
309 | inline void | |
310 | epsilon_node::assign_node_ids(node_id_t& node_count) | |
311 | { | |
312 | m_node_num = node_count++; | |
313 | } | |
314 | ||
315 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
316 | inline void | |
317 | epsilon_node::dump(std::ostream& out) const | |
318 | { | |
319 | out << "\nepsilon_node"; | |
320 | out << " m_node_num = " << m_node_num; | |
321 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
322 | out << " firstpos() = "; | |
323 | node_set fp = firstpos(); | |
324 | std::copy(fp.begin(), fp.end(), | |
325 | std::ostream_iterator<node_id_t>(out, ",")); | |
326 | ||
327 | out << " lastpos() = "; | |
328 | node_set lp = lastpos(); | |
329 | std::copy(lp.begin(), lp.end(), | |
330 | std::ostream_iterator<node_id_t>(out, ",")); | |
331 | } | |
332 | #endif | |
333 | ||
334 | ||
335 | class or_node : public node | |
336 | { | |
337 | ||
338 | public: | |
339 | ||
340 | or_node(node* left, node* right); | |
341 | or_node(const or_node& x); | |
342 | virtual ~or_node(){} | |
343 | ||
344 | virtual node* clone() const; | |
345 | virtual bool nullable() const; | |
346 | virtual node_set firstpos() const; | |
347 | virtual node_set lastpos() const; | |
348 | virtual void compute_followpos(followpos_t& followpos) const; | |
349 | virtual void compute_state_match(state_match_t& state_match ) const; | |
350 | virtual void get_eof_ids(node_set& eof_set) const; | |
351 | virtual void assign_node_ids(node_id_t& node_count); | |
352 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
353 | virtual void dump(std::ostream& out) const; | |
354 | #endif | |
355 | ||
356 | private: | |
357 | ||
358 | std::auto_ptr<node> m_left; | |
359 | std::auto_ptr<node> m_right; | |
360 | }; | |
361 | ||
362 | inline | |
363 | or_node::or_node(node* left, node* right) | |
364 | : node() | |
365 | , m_left(left) | |
366 | , m_right(right) | |
367 | { | |
368 | } | |
369 | ||
370 | inline | |
371 | or_node::or_node(const or_node& x) | |
372 | : node(x) | |
373 | , m_left(x.m_left->clone()) | |
374 | , m_right(x.m_right->clone()) | |
375 | { | |
376 | } | |
377 | ||
378 | inline node * | |
379 | or_node::clone() const | |
380 | { | |
381 | return new or_node(m_left->clone(), m_right->clone()); | |
382 | } | |
383 | ||
384 | inline bool | |
385 | or_node::nullable() const | |
386 | { | |
387 | return m_left->nullable() || m_right->nullable(); | |
388 | } | |
389 | ||
390 | inline node_set | |
391 | or_node::firstpos() const | |
392 | { | |
393 | node_set rval; | |
394 | node_set l = m_left->firstpos(); | |
395 | node_set r = m_right->firstpos(); | |
396 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
397 | std::inserter(rval, rval.begin())); | |
398 | return rval; | |
399 | } | |
400 | ||
401 | inline node_set | |
402 | or_node::lastpos() const | |
403 | { | |
404 | node_set rval; | |
405 | node_set l = m_left->lastpos(); | |
406 | node_set r = m_right->lastpos(); | |
407 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
408 | std::inserter(rval, rval.begin())); | |
409 | return rval; | |
410 | } | |
411 | ||
412 | inline void | |
413 | or_node::compute_followpos(followpos_t& followpos) const | |
414 | { | |
415 | m_left->compute_followpos(followpos); | |
416 | m_right->compute_followpos(followpos); | |
417 | } | |
418 | ||
419 | inline void | |
420 | or_node::compute_state_match(state_match_t& state_match) const | |
421 | { | |
422 | m_left->compute_state_match(state_match); | |
423 | m_right->compute_state_match(state_match); | |
424 | } | |
425 | ||
426 | inline void | |
427 | or_node::get_eof_ids(node_set& eof_nodes) const | |
428 | { | |
429 | m_left->get_eof_ids(eof_nodes); | |
430 | m_right->get_eof_ids(eof_nodes); | |
431 | } | |
432 | ||
433 | inline void | |
434 | or_node::assign_node_ids(node_id_t& node_count) | |
435 | { | |
436 | m_left->assign_node_ids(node_count); | |
437 | m_right->assign_node_ids(node_count); | |
438 | } | |
439 | ||
440 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
441 | inline void | |
442 | or_node::dump(std::ostream& out) const | |
443 | { | |
444 | m_left->dump(out); | |
445 | ||
446 | out << "\nor_node"; | |
447 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
448 | out << " firstpos() = "; | |
449 | node_set fp = firstpos(); | |
450 | std::copy(fp.begin(), fp.end(), | |
451 | std::ostream_iterator<node_id_t>(out, ",")); | |
452 | ||
453 | out << " lastpos() = "; | |
454 | node_set lp = lastpos(); | |
455 | std::copy(lp.begin(), lp.end(), | |
456 | std::ostream_iterator<node_id_t>(out, ",")); | |
457 | ||
458 | m_right->dump(out); | |
459 | } | |
460 | #endif | |
461 | ||
462 | ||
463 | class cat_node : public node | |
464 | { | |
465 | ||
466 | public: | |
467 | ||
468 | cat_node(node* left, node* right); | |
469 | cat_node(const cat_node& x); | |
470 | virtual ~cat_node(){} | |
471 | ||
472 | virtual node* clone() const; | |
473 | virtual bool nullable() const; | |
474 | virtual node_set firstpos() const; | |
475 | virtual node_set lastpos() const; | |
476 | virtual void compute_followpos(followpos_t& followpos) const; | |
477 | virtual void compute_state_match(state_match_t& state_match ) const; | |
478 | virtual void get_eof_ids(node_set& eof_set) const; | |
479 | virtual void assign_node_ids(node_id_t& node_count); | |
480 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
481 | virtual void dump(std::ostream& out) const; | |
482 | #endif | |
483 | ||
484 | private: | |
485 | ||
486 | std::auto_ptr<node> m_left; | |
487 | std::auto_ptr<node> m_right; | |
488 | }; | |
489 | ||
490 | inline | |
491 | cat_node::cat_node(node* left, node* right) | |
492 | : node() | |
493 | , m_left(left) | |
494 | , m_right(right) | |
495 | { | |
496 | } | |
497 | ||
498 | inline | |
499 | cat_node::cat_node(const cat_node& x) | |
500 | : node(x) | |
501 | , m_left(x.m_left->clone()) | |
502 | , m_right(x.m_right->clone()) | |
503 | { | |
504 | } | |
505 | ||
506 | inline node * | |
507 | cat_node::clone() const | |
508 | { | |
509 | return new cat_node(m_left->clone(), m_right->clone()); | |
510 | } | |
511 | ||
512 | inline bool | |
513 | cat_node::nullable() const | |
514 | { | |
515 | return m_left->nullable() && m_right->nullable(); | |
516 | } | |
517 | ||
518 | inline node_set | |
519 | cat_node::firstpos() const | |
520 | { | |
521 | if (m_left->nullable()) | |
522 | { | |
523 | node_set rval; | |
524 | node_set l = m_left->firstpos(); | |
525 | node_set r = m_right->firstpos(); | |
526 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
527 | std::inserter(rval, rval.begin())); | |
528 | return rval; | |
529 | } | |
530 | else | |
531 | { | |
532 | return m_left->firstpos(); | |
533 | } | |
534 | } | |
535 | ||
536 | inline node_set | |
537 | cat_node::lastpos() const | |
538 | { | |
539 | if (m_right->nullable()) | |
540 | { | |
541 | node_set rval; | |
542 | node_set l = m_left->lastpos(); | |
543 | node_set r = m_right->lastpos(); | |
544 | std::set_union(l.begin(), l.end(), r.begin(), r.end(), | |
545 | std::inserter(rval, rval.begin())); | |
546 | return rval; | |
547 | } | |
548 | else | |
549 | { | |
550 | return m_right->lastpos(); | |
551 | } | |
552 | } | |
553 | ||
554 | inline void | |
555 | cat_node::compute_followpos(followpos_t& followpos) const | |
556 | { | |
557 | node_set l = m_left->lastpos(); | |
558 | for (node_set::iterator i = l.begin(); | |
559 | i != l.end(); | |
560 | ++i) | |
561 | { | |
562 | node_set rf = m_right->firstpos(); | |
563 | followpos[*i].insert(rf.begin(), rf.end()); | |
564 | } | |
565 | ||
566 | m_left->compute_followpos(followpos); | |
567 | m_right->compute_followpos(followpos); | |
568 | } | |
569 | ||
570 | inline void | |
571 | cat_node::compute_state_match(state_match_t& state_match) const | |
572 | { | |
573 | m_left->compute_state_match(state_match); | |
574 | m_right->compute_state_match(state_match); | |
575 | } | |
576 | ||
577 | inline void | |
578 | cat_node::get_eof_ids(node_set& eof_nodes) const | |
579 | { | |
580 | m_left->get_eof_ids(eof_nodes); | |
581 | m_right->get_eof_ids(eof_nodes); | |
582 | } | |
583 | ||
584 | inline void | |
585 | cat_node::assign_node_ids(node_id_t& node_count) | |
586 | { | |
587 | m_left->assign_node_ids(node_count); | |
588 | m_right->assign_node_ids(node_count); | |
589 | } | |
590 | ||
591 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
592 | inline void | |
593 | cat_node::dump(std::ostream& out) const | |
594 | { | |
595 | m_left->dump(out); | |
596 | ||
597 | out << "\ncat_node"; | |
598 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
599 | out << " firstpos() = "; | |
600 | node_set fp = firstpos(); | |
601 | std::copy(fp.begin(), fp.end(), | |
602 | std::ostream_iterator<node_id_t>(out, ",")); | |
603 | ||
604 | out << " lastpos() = "; | |
605 | node_set lp = lastpos(); | |
606 | std::copy(lp.begin(), lp.end(), | |
607 | std::ostream_iterator<node_id_t>(out, ",")); | |
608 | ||
609 | m_right->dump(out); | |
610 | } | |
611 | #endif | |
612 | ||
613 | ||
614 | class star_node : public node | |
615 | { | |
616 | ||
617 | public: | |
618 | ||
619 | star_node(node* left); | |
620 | star_node(const star_node& x); | |
621 | virtual ~star_node(){} | |
622 | ||
623 | virtual node* clone() const; | |
624 | virtual bool nullable() const; | |
625 | virtual node_set firstpos() const; | |
626 | virtual node_set lastpos() const; | |
627 | virtual void compute_followpos(followpos_t& followpos) const; | |
628 | virtual void compute_state_match(state_match_t& state_match ) const; | |
629 | virtual void get_eof_ids(node_set& eof_set) const; | |
630 | virtual void assign_node_ids(node_id_t& node_count); | |
631 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
632 | virtual void dump(std::ostream& out) const; | |
633 | #endif | |
634 | ||
635 | private: | |
636 | ||
637 | std::auto_ptr<node> m_left; | |
638 | }; | |
639 | ||
640 | inline | |
641 | star_node::star_node(node* left) | |
642 | : node() | |
643 | , m_left(left) | |
644 | { | |
645 | } | |
646 | ||
647 | inline | |
648 | star_node::star_node(const star_node& x) | |
649 | : node(x) | |
650 | , m_left(x.m_left->clone()) | |
651 | { | |
652 | } | |
653 | ||
654 | inline node * | |
655 | star_node::clone() const | |
656 | { | |
657 | return new star_node(m_left->clone()); | |
658 | } | |
659 | ||
660 | inline bool | |
661 | star_node::nullable() const | |
662 | { | |
663 | return true; | |
664 | } | |
665 | ||
666 | inline node_set | |
667 | star_node::firstpos() const | |
668 | { | |
669 | return m_left->firstpos(); | |
670 | } | |
671 | ||
672 | inline node_set | |
673 | star_node::lastpos() const | |
674 | { | |
675 | return m_left->lastpos(); | |
676 | } | |
677 | ||
678 | inline void | |
679 | star_node::compute_followpos(followpos_t& followpos) const | |
680 | { | |
681 | node_set lp = this->lastpos(); | |
682 | for (node_set::iterator i = lp.begin(); | |
683 | i != lp.end(); | |
684 | ++i) | |
685 | { | |
686 | node_set fp = this->firstpos(); | |
687 | followpos[*i].insert(fp.begin(), fp.end()); | |
688 | } | |
689 | ||
690 | m_left->compute_followpos(followpos); | |
691 | } | |
692 | ||
693 | inline void | |
694 | star_node::compute_state_match(state_match_t& state_match) const | |
695 | { | |
696 | m_left->compute_state_match(state_match); | |
697 | } | |
698 | ||
699 | inline void | |
700 | star_node::get_eof_ids(node_set& eof_nodes) const | |
701 | { | |
702 | m_left->get_eof_ids(eof_nodes); | |
703 | } | |
704 | ||
705 | inline void | |
706 | star_node::assign_node_ids(node_id_t& node_count) | |
707 | { | |
708 | m_left->assign_node_ids(node_count); | |
709 | } | |
710 | ||
711 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
712 | inline void | |
713 | star_node::dump(std::ostream& out) const | |
714 | { | |
715 | m_left->dump(out); | |
716 | ||
717 | out << "\nstar_node"; | |
718 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
719 | out << " firstpos() = "; | |
720 | node_set fp = firstpos(); | |
721 | std::copy(fp.begin(), fp.end(), | |
722 | std::ostream_iterator<node_id_t>(out, ",")); | |
723 | ||
724 | out << " lastpos() = "; | |
725 | node_set lp = lastpos(); | |
726 | std::copy(lp.begin(), lp.end(), | |
727 | std::ostream_iterator<node_id_t>(out, ",")); | |
728 | ||
729 | } | |
730 | #endif | |
731 | ||
732 | ||
733 | class eof_node : public node | |
734 | { | |
735 | ||
736 | public: | |
737 | ||
738 | eof_node(); | |
739 | eof_node(const eof_node& x); | |
740 | virtual ~eof_node(){} | |
741 | ||
742 | virtual node* clone() const; | |
743 | virtual bool nullable() const; | |
744 | virtual node_set firstpos() const; | |
745 | virtual node_set lastpos() const; | |
746 | virtual void compute_followpos(followpos_t& followpos) const; | |
747 | virtual void compute_state_match(state_match_t& state_match ) const; | |
748 | virtual void get_eof_ids(node_set& eof_set) const; | |
749 | virtual void assign_node_ids(node_id_t& node_count); | |
750 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
751 | virtual void dump(std::ostream& out) const; | |
752 | #endif | |
753 | ||
754 | private: | |
755 | ||
756 | node_id_t m_node_num; | |
757 | }; | |
758 | ||
759 | inline | |
760 | eof_node::eof_node() | |
761 | : node() | |
762 | , m_node_num(0) | |
763 | { | |
764 | } | |
765 | ||
766 | inline | |
767 | eof_node::eof_node(const eof_node& x) | |
768 | : node(x) | |
769 | , m_node_num(x.m_node_num) | |
770 | { | |
771 | } | |
772 | ||
773 | inline node * | |
774 | eof_node::clone() const | |
775 | { | |
776 | return new eof_node(*this); | |
777 | } | |
778 | ||
779 | inline bool | |
780 | eof_node::nullable() const | |
781 | { | |
782 | return false; | |
783 | } | |
784 | ||
785 | inline node_set | |
786 | eof_node::firstpos() const | |
787 | { | |
788 | node_set rval; | |
789 | rval.insert(m_node_num); | |
790 | return rval; | |
791 | } | |
792 | ||
793 | inline node_set | |
794 | eof_node::lastpos() const | |
795 | { | |
796 | node_set rval; | |
797 | rval.insert(m_node_num); | |
798 | return rval; | |
799 | } | |
800 | ||
801 | inline void | |
802 | eof_node::compute_followpos(followpos_t&) const | |
803 | { | |
804 | return; | |
805 | } | |
806 | ||
807 | inline void | |
808 | eof_node::compute_state_match(state_match_t& state_match) const | |
809 | { | |
810 | if (state_match.size() < m_node_num + 1) | |
811 | state_match.resize(m_node_num + 1); | |
812 | state_match[m_node_num].resize(256, 0); | |
813 | } | |
814 | ||
815 | inline void | |
816 | eof_node::get_eof_ids(node_set& eof_nodes) const | |
817 | { | |
818 | eof_nodes.insert(m_node_num); | |
819 | } | |
820 | ||
821 | inline void | |
822 | eof_node::assign_node_ids(node_id_t& node_count) | |
823 | { | |
824 | m_node_num = node_count++; | |
825 | } | |
826 | ||
827 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
828 | inline void | |
829 | eof_node::dump(std::ostream& out) const | |
830 | { | |
831 | out << "\neof_node"; | |
832 | out << " m_node_num = " << m_node_num; | |
833 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
834 | out << " firstpos() = "; | |
835 | node_set fp = firstpos(); | |
836 | std::copy(fp.begin(), fp.end(), | |
837 | std::ostream_iterator<node_id_t>(out, ",")); | |
838 | ||
839 | out << " lastpos() = "; | |
840 | node_set lp = lastpos(); | |
841 | std::copy(lp.begin(), lp.end(), | |
842 | std::ostream_iterator<node_id_t>(out, ",")); | |
843 | } | |
844 | #endif | |
845 | ||
846 | class ccl_node : public node | |
847 | { | |
848 | ||
849 | public: | |
850 | ||
851 | ccl_node(const std::vector<uchar>& v); | |
852 | ccl_node(const uchar c1, const uchar c2); | |
853 | ccl_node(const ccl_node& x); | |
854 | virtual ~ccl_node(){} | |
855 | ||
856 | virtual node* clone() const; | |
857 | virtual bool nullable() const; | |
858 | virtual node_set firstpos() const; | |
859 | virtual node_set lastpos() const; | |
860 | virtual void compute_followpos(followpos_t& followpos) const; | |
861 | virtual void compute_state_match(state_match_t& state_match ) const; | |
862 | virtual void get_eof_ids(node_set& eof_set) const; | |
863 | virtual void assign_node_ids(node_id_t& node_count); | |
864 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
865 | virtual void dump(std::ostream& out) const; | |
866 | #endif | |
867 | ||
868 | private: | |
869 | ||
870 | std::vector<uchar> m_match; | |
871 | node_id_t m_node_num; | |
872 | }; | |
873 | ||
874 | inline | |
875 | ccl_node::ccl_node(const std::vector<uchar>& v) | |
876 | : node() | |
877 | , m_match(v) | |
878 | , m_node_num(0) | |
879 | { | |
880 | m_match.resize(256); // make sure it's the right size | |
881 | } | |
882 | ||
883 | inline | |
884 | ccl_node::ccl_node(const uchar c1, const uchar c2) | |
885 | : node() | |
886 | , m_match(256, uchar(0)) | |
887 | , m_node_num(0) | |
888 | { | |
889 | BOOST_ASSERT(c1 < c2); | |
890 | for (std::size_t i = c1; i <= std::size_t(c2); ++i) | |
891 | { | |
892 | m_match[i] = 1; | |
893 | } | |
894 | } | |
895 | ||
896 | inline | |
897 | ccl_node::ccl_node(const ccl_node& x) | |
898 | : node(x) | |
899 | , m_match(x.m_match) | |
900 | , m_node_num(x.m_node_num) | |
901 | { | |
902 | } | |
903 | ||
904 | inline node * | |
905 | ccl_node::clone() const | |
906 | { | |
907 | return new ccl_node(*this); | |
908 | } | |
909 | ||
910 | inline bool | |
911 | ccl_node::nullable() const | |
912 | { | |
913 | return false; | |
914 | } | |
915 | ||
916 | inline node_set | |
917 | ccl_node::firstpos() const | |
918 | { | |
919 | node_set rval; | |
920 | rval.insert(m_node_num); | |
921 | return rval; | |
922 | } | |
923 | ||
924 | inline node_set | |
925 | ccl_node::lastpos() const | |
926 | { | |
927 | return firstpos(); | |
928 | } | |
929 | ||
930 | inline void | |
931 | ccl_node::compute_followpos(followpos_t&) const | |
932 | { | |
933 | return; | |
934 | } | |
935 | ||
936 | inline void | |
937 | ccl_node::compute_state_match(state_match_t& state_match) const | |
938 | { | |
939 | if (state_match.size() < m_node_num + 1) | |
940 | state_match.resize(m_node_num + 1); | |
941 | state_match[m_node_num] = m_match; | |
942 | } | |
943 | ||
944 | inline void | |
945 | ccl_node::get_eof_ids(node_set&) const | |
946 | { | |
947 | return; | |
948 | } | |
949 | ||
950 | inline void | |
951 | ccl_node::assign_node_ids(node_id_t& node_count) | |
952 | { | |
953 | m_node_num = node_count++; | |
954 | } | |
955 | ||
956 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
957 | inline void | |
958 | ccl_node::dump(std::ostream& out) const | |
959 | { | |
960 | out << "\nccl_node m_match = "; | |
961 | for (std::size_t i = 0; i < m_match.size(); ++i) | |
962 | { | |
963 | if (m_match[i]) | |
964 | out << i << ", "; | |
965 | } | |
966 | out << " m_node_num = " << m_node_num; | |
967 | out << " nullable() = " << (nullable() ? "true" : "false"); | |
968 | out << " firstpos() = "; | |
969 | node_set fp = firstpos(); | |
970 | std::copy(fp.begin(), fp.end(), | |
971 | std::ostream_iterator<node_id_t>(out, ",")); | |
972 | ||
973 | out << " lastpos() = "; | |
974 | node_set lp = lastpos(); | |
975 | std::copy(lp.begin(), lp.end(), | |
976 | std::ostream_iterator<node_id_t>(out, ",")); | |
977 | } | |
978 | #endif | |
979 | ||
980 | template <typename ScannerT> | |
981 | class make_concat | |
982 | { | |
983 | typedef typename ScannerT::iterator_t iterator_type; | |
984 | ||
985 | public: | |
986 | ||
987 | make_concat(std::stack<node*>& the_stack) | |
988 | : m_stack(the_stack) | |
989 | {} | |
990 | ||
991 | void operator()(iterator_type const &, iterator_type const &) const | |
992 | { | |
993 | node* right = m_stack.top(); | |
994 | m_stack.pop(); | |
995 | node* left = m_stack.top(); | |
996 | m_stack.pop(); | |
997 | node* newnode = new cat_node(left, right); | |
998 | m_stack.push(newnode); | |
999 | } | |
1000 | ||
1001 | std::stack<node*>& m_stack; | |
1002 | }; | |
1003 | ||
1004 | template <int CharTSize> | |
1005 | struct get_byte_aux; | |
1006 | ||
1007 | template<> | |
1008 | struct get_byte_aux<1> | |
1009 | { | |
1010 | template <typename CharT> | |
1011 | unsigned char operator()(CharT c, unsigned int byte) | |
1012 | { | |
1013 | BOOST_ASSERT(byte == 0); | |
1014 | return c; | |
1015 | } | |
1016 | }; | |
1017 | ||
1018 | template<> | |
1019 | struct get_byte_aux<2> | |
1020 | { | |
1021 | template <typename CharT> | |
1022 | unsigned char operator()(CharT c, unsigned int byte) | |
1023 | { | |
1024 | static unsigned long mask[] = | |
1025 | { | |
1026 | 0xFF00, | |
1027 | 0x00FF | |
1028 | }; | |
1029 | ||
1030 | BOOST_ASSERT(byte < 2); | |
1031 | return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); | |
1032 | } | |
1033 | }; | |
1034 | ||
1035 | template<> | |
1036 | struct get_byte_aux<4> | |
1037 | { | |
1038 | template <typename CharT> | |
1039 | unsigned char operator()(CharT c, unsigned int byte) | |
1040 | { | |
1041 | static unsigned long mask[] = | |
1042 | { | |
1043 | 0xFF000000, | |
1044 | 0x00FF0000, | |
1045 | 0x0000FF00, | |
1046 | 0x000000FF | |
1047 | }; | |
1048 | ||
1049 | BOOST_ASSERT(byte < 4); | |
1050 | return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); | |
1051 | } | |
1052 | }; | |
1053 | ||
1054 | template <typename CharT> | |
1055 | inline unsigned char | |
1056 | get_byte(CharT c, unsigned int byte) | |
1057 | { | |
1058 | return get_byte_aux<sizeof(c)>()(c, byte); | |
1059 | } | |
1060 | ||
1061 | template <typename ScannerT> | |
1062 | class make_star | |
1063 | { | |
1064 | typedef typename ScannerT::iterator_t iterator_type; | |
1065 | ||
1066 | public: | |
1067 | typedef | |
1068 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1069 | char_t; | |
1070 | ||
1071 | make_star(std::stack<node*>& the_stack) | |
1072 | : m_stack(the_stack) | |
1073 | {} | |
1074 | ||
1075 | void operator()(const char_t) const | |
1076 | { | |
1077 | node* left = m_stack.top(); | |
1078 | m_stack.pop(); | |
1079 | node* newnode = new star_node(left); | |
1080 | m_stack.push(newnode); | |
1081 | } | |
1082 | ||
1083 | std::stack<node*>& m_stack; | |
1084 | }; | |
1085 | ||
1086 | template <typename ScannerT> | |
1087 | class make_or | |
1088 | { | |
1089 | typedef typename ScannerT::iterator_t iterator_type; | |
1090 | ||
1091 | public: | |
1092 | ||
1093 | make_or(std::stack<node*>& the_stack) | |
1094 | : m_stack(the_stack) | |
1095 | {} | |
1096 | ||
1097 | void operator()(iterator_type const&, iterator_type const&) const | |
1098 | { | |
1099 | node* right = m_stack.top(); | |
1100 | m_stack.pop(); | |
1101 | node* left = m_stack.top(); | |
1102 | m_stack.pop(); | |
1103 | node* newnode = new or_node(left, right); | |
1104 | m_stack.push(newnode); | |
1105 | } | |
1106 | ||
1107 | std::stack<node*>& m_stack; | |
1108 | }; | |
1109 | ||
1110 | template <typename ScannerT> | |
1111 | class make_plus | |
1112 | { | |
1113 | typedef typename ScannerT::iterator_t iterator_type; | |
1114 | ||
1115 | public: | |
1116 | typedef | |
1117 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1118 | char_t; | |
1119 | ||
1120 | make_plus(std::stack<node*>& the_stack) | |
1121 | : m_stack(the_stack) | |
1122 | {} | |
1123 | ||
1124 | void operator()(const char_t) const | |
1125 | { | |
1126 | node* left = m_stack.top(); | |
1127 | m_stack.pop(); | |
1128 | ||
1129 | node* copy = left->clone(); | |
1130 | ||
1131 | node* new_star = new star_node(copy); | |
1132 | node* new_cat = new cat_node(left, new_star); | |
1133 | m_stack.push(new_cat); | |
1134 | } | |
1135 | ||
1136 | std::stack<node*>& m_stack; | |
1137 | }; | |
1138 | ||
1139 | template <typename ScannerT> | |
1140 | class make_optional | |
1141 | { | |
1142 | typedef typename ScannerT::iterator_t iterator_type; | |
1143 | ||
1144 | public: | |
1145 | typedef | |
1146 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1147 | char_t; | |
1148 | ||
1149 | make_optional(std::stack<node*>& the_stack) | |
1150 | : m_stack(the_stack) | |
1151 | {} | |
1152 | ||
1153 | void operator()(const char_t) const | |
1154 | { | |
1155 | node* left = m_stack.top(); | |
1156 | m_stack.pop(); | |
1157 | ||
1158 | node* new_or = new or_node(left, new epsilon_node()); | |
1159 | m_stack.push(new_or); | |
1160 | } | |
1161 | ||
1162 | std::stack<node*>& m_stack; | |
1163 | }; | |
1164 | ||
1165 | /////////////////////////////////////////////////////////////////////////////// | |
1166 | // utility function | |
1167 | template <typename CharT> | |
1168 | inline utility::impl::range<CharT> const& | |
1169 | full_range() | |
1170 | { | |
1171 | static utility::impl::range<CharT> full((std::numeric_limits<CharT>::min)(), | |
1172 | (std::numeric_limits<CharT>::max)()); | |
1173 | return full; | |
1174 | } | |
1175 | ||
1176 | namespace ccl_utils | |
1177 | { | |
1178 | template <typename char_t> | |
1179 | inline utility::impl::range_run<char_t> | |
1180 | negate_range_run( | |
1181 | const utility::impl::range_run<char_t>& rr) | |
1182 | { | |
1183 | utility::impl::range_run<char_t> newrr; | |
1184 | newrr.set(full_range<char_t>()); | |
1185 | for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); | |
1186 | iter != rr.end(); ++iter) | |
1187 | newrr.clear(*iter); | |
1188 | return newrr; | |
1189 | } | |
1190 | ||
1191 | template <typename char_t> | |
1192 | inline node* | |
1193 | create_mb_node_seq(char_t c) | |
1194 | { | |
1195 | node* newnode = new char_node(get_byte(c, 0)); | |
1196 | for (unsigned int i = 1; i < sizeof(c); ++i) | |
1197 | { | |
1198 | node* cnode = new char_node(get_byte(c, i)); | |
1199 | node* top_node = new cat_node(newnode, cnode); | |
1200 | newnode = top_node; | |
1201 | } | |
1202 | return newnode; | |
1203 | } | |
1204 | ||
1205 | template <typename char_t> | |
1206 | inline void | |
1207 | handle_mb_char(char_t c, bool first_time, | |
1208 | std::stack<node*>& stack) | |
1209 | { | |
1210 | node* newnode = create_mb_node_seq(c); | |
1211 | ||
1212 | if (first_time) | |
1213 | { | |
1214 | stack.push(newnode); | |
1215 | } | |
1216 | else | |
1217 | { | |
1218 | node* top = stack.top(); | |
1219 | stack.pop(); | |
1220 | ||
1221 | node* newtop = new or_node(top, newnode); | |
1222 | stack.push(newtop); | |
1223 | } | |
1224 | } | |
1225 | ||
1226 | // forward decl only | |
1227 | template <typename char_t> | |
1228 | inline void | |
1229 | handle_mb_range(char_t c1, char_t c2, bool first_time, | |
1230 | std::stack<node*>& stack); | |
1231 | ||
1232 | template <typename char_t> | |
1233 | inline void | |
1234 | create_nodes(const utility::impl::range_run<char_t>& rr, | |
1235 | std::stack<node*>& stack) | |
1236 | { | |
1237 | ||
1238 | if (sizeof(char_t) == 1) | |
1239 | { | |
1240 | std::vector<uchar> ccl; | |
1241 | ccl.resize(256); | |
1242 | for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); | |
1243 | iter != rr.end(); ++iter) | |
1244 | { | |
1245 | for (int i = iter->first; i <= iter->last; ++i) | |
1246 | { | |
1247 | // this is always true because of the limited datatype | |
1248 | // BOOST_ASSERT(uchar(i) < 256 && ccl.size() == 256); | |
1249 | ccl[uchar(i)] = 1; | |
1250 | } | |
1251 | } | |
1252 | ||
1253 | node* new_ccl = new ccl_node(ccl); | |
1254 | stack.push(new_ccl); | |
1255 | } | |
1256 | else | |
1257 | { | |
1258 | bool mb_first_time = true; | |
1259 | for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); | |
1260 | iter != rr.end(); ++iter) | |
1261 | { | |
1262 | if (iter->first == iter->last) | |
1263 | { | |
1264 | handle_mb_char(iter->first, mb_first_time, stack); | |
1265 | } | |
1266 | else | |
1267 | { | |
1268 | handle_mb_range(iter->first, iter->last, mb_first_time, stack); | |
1269 | } | |
1270 | mb_first_time = false; | |
1271 | } | |
1272 | } | |
1273 | } | |
1274 | ||
1275 | template <typename char_t> | |
1276 | inline std::size_t | |
1277 | compute_differing_byte(char_t c1, char_t c2) | |
1278 | { | |
1279 | std::size_t rval = 0; | |
1280 | while (rval < sizeof(c1) && | |
1281 | get_byte(c1, (unsigned int)rval) == get_byte(c2, (unsigned int)rval)) | |
1282 | { | |
1283 | ++rval; | |
1284 | } | |
1285 | return rval; | |
1286 | } | |
1287 | ||
1288 | template <typename char_t> | |
1289 | inline node* | |
1290 | create_mb_node_type1(std::size_t j, char_t c1, char_t c2) | |
1291 | { | |
1292 | std::size_t diff = get_byte(c2, (unsigned int)j) - | |
1293 | get_byte(c1, (unsigned int)j); | |
1294 | if (diff == 1) { | |
1295 | return 0; | |
1296 | } | |
1297 | else if (diff == 2) { | |
1298 | return new char_node(get_byte(c1, (unsigned int)j)+1); | |
1299 | } | |
1300 | else { | |
1301 | return new ccl_node(get_byte(c1, (unsigned int)j)+1, | |
1302 | get_byte(c2, (unsigned int)j)-1); | |
1303 | } | |
1304 | } | |
1305 | ||
1306 | template <typename char_t> | |
1307 | inline node * | |
1308 | create_mb_node_for_byte(std::size_t i, std::size_t j, std::size_t sizem1, | |
1309 | std::size_t differing_byte, char_t c1, char_t c2, node* newnode) | |
1310 | { | |
1311 | node* cnode; | |
1312 | if (i == sizem1 && j == differing_byte && j != sizem1) | |
1313 | { | |
1314 | node* tmp = create_mb_node_type1(j, c1, c2); | |
1315 | if (tmp == 0) | |
1316 | { | |
1317 | delete newnode; | |
1318 | return 0; | |
1319 | } | |
1320 | else | |
1321 | cnode = tmp; | |
1322 | } | |
1323 | else if (i == differing_byte && j == sizem1) | |
1324 | { | |
1325 | if (i != sizem1) { | |
1326 | cnode = new ccl_node(get_byte(c1, (unsigned int)j), 0xFF); | |
1327 | } | |
1328 | else { | |
1329 | cnode = new ccl_node(get_byte(c1, (unsigned int)j), | |
1330 | get_byte(c2, (unsigned int)j)); | |
1331 | } | |
1332 | } | |
1333 | else if (i != differing_byte && i != sizem1 && | |
1334 | j == (sizem1 - i + differing_byte)) | |
1335 | { | |
1336 | cnode = new ccl_node(get_byte(c1, (unsigned int)j)+1, 0xFF); | |
1337 | } | |
1338 | else if (i + j - differing_byte > sizem1) { | |
1339 | cnode = new ccl_node(0, 0xFF); | |
1340 | } | |
1341 | else {//if (is plain) | |
1342 | cnode = new char_node(get_byte(c1, (unsigned int)j)); | |
1343 | } | |
1344 | ||
1345 | node* top_node = new cat_node(newnode, cnode); | |
1346 | return top_node; | |
1347 | } | |
1348 | ||
1349 | // On platforms, where wchar_t is a typedef for unsigned short, the | |
1350 | // comparision for a negative value is pointless | |
1351 | template <bool is_signed> | |
1352 | struct correct_char_aux { | |
1353 | }; | |
1354 | ||
1355 | template <> | |
1356 | struct correct_char_aux<true> { | |
1357 | ||
1358 | template <typename char_t> | |
1359 | static char_t correct(char_t c) { if (c < 0) c = 0; return c; } | |
1360 | }; | |
1361 | ||
1362 | template <> | |
1363 | struct correct_char_aux<false> { | |
1364 | ||
1365 | template <typename char_t> | |
1366 | static char_t correct(char_t c) { return c; } | |
1367 | }; | |
1368 | ||
1369 | template <typename char_t> | |
1370 | struct correct_char | |
1371 | { | |
1372 | static char_t correct(char_t c) | |
1373 | { | |
1374 | return correct_char_aux<std::numeric_limits<char_t>::is_signed >:: | |
1375 | correct(c); | |
1376 | } | |
1377 | }; | |
1378 | ||
1379 | template <typename char_t> | |
1380 | inline void | |
1381 | handle_mb_range(char_t c1, char_t c2, bool first_time, | |
1382 | std::stack<node*>& stack) | |
1383 | { | |
1384 | // The algorithm can't handle negative value chars, which don't make | |
1385 | // much sense anyway. This comparision is pointless for wchar_t's on | |
1386 | // platforms, where wchar_t is a typedef for unsigned short | |
1387 | ||
1388 | c1 = correct_char<char_t>::correct(c1); | |
1389 | //if (c1 < 0) | |
1390 | // c1 = 0; | |
1391 | ||
1392 | BOOST_ASSERT(c1 < c2); | |
1393 | node* newnode = 0; | |
1394 | node* savednode = 0; | |
1395 | const std::size_t differing_byte = compute_differing_byte(c1, c2); | |
1396 | const std::size_t sizem1 = sizeof(c1) - 1; | |
1397 | const std::size_t ndb = sizem1 - differing_byte; | |
1398 | for (std::size_t i = differing_byte; i < sizeof(c1); ++i) | |
1399 | { | |
1400 | // generate node for the first byte | |
1401 | if (differing_byte == 0 && i == ndb) | |
1402 | { | |
1403 | node* tmp = create_mb_node_type1(0, c1, c2); | |
1404 | if (tmp == 0) | |
1405 | continue; | |
1406 | else | |
1407 | newnode = tmp; | |
1408 | } | |
1409 | else | |
1410 | { | |
1411 | newnode = new char_node(get_byte(c1, 0)); | |
1412 | } | |
1413 | for (std::size_t j = 1; j < sizeof(c1); ++j) | |
1414 | { | |
1415 | newnode = create_mb_node_for_byte(i, j, sizem1, differing_byte, | |
1416 | c1, c2, newnode); | |
1417 | if (newnode == 0) | |
1418 | goto end_outer_for; | |
1419 | } | |
1420 | ||
1421 | // or together the various parts | |
1422 | if (savednode) | |
1423 | { | |
1424 | node* top_node = new or_node(savednode, newnode); | |
1425 | savednode = top_node; | |
1426 | } | |
1427 | else | |
1428 | { | |
1429 | savednode = newnode; | |
1430 | } | |
1431 | end_outer_for: | |
1432 | continue; | |
1433 | } | |
1434 | ||
1435 | for (std::size_t k = 0; k < ndb; ++k) | |
1436 | { | |
1437 | newnode = new char_node(get_byte(c2, 0)); | |
1438 | for (std::size_t j = 1; j < sizeof(c2); ++j) | |
1439 | { | |
1440 | node* cnode; | |
1441 | if (k == differing_byte && j == sizem1 && k != sizem1) | |
1442 | cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)); | |
1443 | ||
1444 | else if (k != differing_byte && k != sizem1 && | |
1445 | j == (sizem1 - k + differing_byte)) | |
1446 | cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)-1); | |
1447 | ||
1448 | else if (k + j - differing_byte > sizem1) | |
1449 | cnode = new ccl_node(0, 0xFF); | |
1450 | ||
1451 | else //if (is plain) | |
1452 | cnode = new char_node(get_byte(c2, (unsigned int)j)); | |
1453 | ||
1454 | ||
1455 | node* top_node = new cat_node(newnode, cnode); | |
1456 | newnode = top_node; | |
1457 | } | |
1458 | ||
1459 | // or together the various parts | |
1460 | if (savednode) | |
1461 | { | |
1462 | node* top_node = new or_node(savednode, newnode); | |
1463 | savednode = top_node; | |
1464 | } | |
1465 | else | |
1466 | { | |
1467 | savednode = newnode; | |
1468 | } | |
1469 | } | |
1470 | ||
1471 | ||
1472 | if (first_time) | |
1473 | { | |
1474 | stack.push(savednode); | |
1475 | } | |
1476 | else | |
1477 | { | |
1478 | node* top = stack.top(); | |
1479 | stack.pop(); | |
1480 | ||
1481 | node* newtop = new or_node(top, savednode); | |
1482 | stack.push(newtop); | |
1483 | } | |
1484 | } | |
1485 | } // namespace ccl_utils | |
1486 | ||
1487 | template <typename ScannerT> | |
1488 | class make_char | |
1489 | { | |
1490 | typedef typename ScannerT::iterator_t iterator_type; | |
1491 | ||
1492 | public: | |
1493 | typedef | |
1494 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1495 | char_t; | |
1496 | ||
1497 | make_char(std::stack<node*>& the_stack) | |
1498 | : m_stack(the_stack) | |
1499 | {} | |
1500 | ||
1501 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1502 | { | |
1503 | const escape_char_parser<lex_escapes, char_t> lex_escape_ch = | |
1504 | escape_char_parser<lex_escapes, char_t>(); | |
1505 | char_t the_char; | |
1506 | iterator_type first_ = first; | |
1507 | ScannerT scan(first_, last); | |
1508 | lex_escape_ch[assign(the_char)].parse(scan); | |
1509 | node* newnode = ccl_utils::create_mb_node_seq(the_char); | |
1510 | m_stack.push(newnode); | |
1511 | } | |
1512 | ||
1513 | std::stack<node*>& m_stack; | |
1514 | }; | |
1515 | ||
1516 | ||
1517 | template <typename ScannerT> | |
1518 | class make_ccl | |
1519 | { | |
1520 | typedef typename ScannerT::iterator_t iterator_type; | |
1521 | ||
1522 | public: | |
1523 | typedef | |
1524 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1525 | char_t; | |
1526 | ||
1527 | make_ccl(std::stack<node*>& the_stack) | |
1528 | : m_stack(the_stack) | |
1529 | {} | |
1530 | ||
1531 | static bool is_equal_to_string(iterator_type first, | |
1532 | iterator_type const & last, const char* str) | |
1533 | { | |
1534 | while (first != last &&*str &&*first ==*str) | |
1535 | { | |
1536 | ++first; | |
1537 | ++str; | |
1538 | } | |
1539 | return*str == 0; | |
1540 | } | |
1541 | ||
1542 | template <typename ParserT> | |
1543 | static void fill_ccl(utility::impl::range_run<char_t>& rr, const ParserT& parser) | |
1544 | { | |
1545 | for (int i = 0; i < 256; ++i) | |
1546 | { | |
1547 | if (parser.test(static_cast<char_t>(uchar(i)))) | |
1548 | rr.set(utility::impl::range<char_t>(char_t(i), char_t(i))); | |
1549 | } | |
1550 | } | |
1551 | ||
1552 | void operator()(iterator_type const& first_, iterator_type const& last) const | |
1553 | { | |
1554 | BOOST_ASSERT(*first_ == '['); | |
1555 | ||
1556 | iterator_type first = first_; | |
1557 | ++first; // skip over '[' | |
1558 | bool negated_ccl = false; | |
1559 | if (*first == '^') | |
1560 | { | |
1561 | negated_ccl = true; | |
1562 | ++first; | |
1563 | } | |
1564 | ||
1565 | utility::impl::range_run<char_t> rr; | |
1566 | while (first != last &&*first != ']') | |
1567 | { | |
1568 | if (*first == '[') // it's a ccl_expr like [:space:] | |
1569 | { | |
1570 | // check for [:space:], etc. | |
1571 | if (is_equal_to_string(first, last, "[:alnum:]")) | |
1572 | { | |
1573 | fill_ccl(rr, alnum_p); | |
1574 | } | |
1575 | else if (is_equal_to_string(first, last, "[:alpha:]")) | |
1576 | { | |
1577 | fill_ccl(rr, alpha_p); | |
1578 | } | |
1579 | else if (is_equal_to_string(first, last, "[:blank:]")) | |
1580 | { | |
1581 | fill_ccl(rr, blank_p); | |
1582 | } | |
1583 | else if (is_equal_to_string(first, last, "[:cntrl:]")) | |
1584 | { | |
1585 | fill_ccl(rr, cntrl_p); | |
1586 | } | |
1587 | else if (is_equal_to_string(first, last, "[:digit:]")) | |
1588 | { | |
1589 | fill_ccl(rr, digit_p); | |
1590 | } | |
1591 | else if (is_equal_to_string(first, last, "[:graph:]")) | |
1592 | { | |
1593 | fill_ccl(rr, graph_p); | |
1594 | } | |
1595 | else if (is_equal_to_string(first, last, "[:lower:]")) | |
1596 | { | |
1597 | fill_ccl(rr, lower_p); | |
1598 | } | |
1599 | else if (is_equal_to_string(first, last, "[:print:]")) | |
1600 | { | |
1601 | fill_ccl(rr, print_p); | |
1602 | } | |
1603 | else if (is_equal_to_string(first, last, "[:punct:]")) | |
1604 | { | |
1605 | fill_ccl(rr, punct_p); | |
1606 | } | |
1607 | else if (is_equal_to_string(first, last, "[:space:]")) | |
1608 | { | |
1609 | fill_ccl(rr, space_p); | |
1610 | } | |
1611 | else if (is_equal_to_string(first, last, "[:upper:]")) | |
1612 | { | |
1613 | fill_ccl(rr, upper_p); | |
1614 | } | |
1615 | else if (is_equal_to_string(first, last, "[:xdigit:]")) | |
1616 | { | |
1617 | fill_ccl(rr, xdigit_p); | |
1618 | } | |
1619 | // this can't happen, because it's parsed before we get here. | |
1620 | //else | |
1621 | // throw bad_regex(); | |
1622 | ||
1623 | // Advance past the character class expression | |
1624 | while (first != last &&*first != ']') | |
1625 | ++first; | |
1626 | BOOST_ASSERT(*first == ']'); | |
1627 | ++first; | |
1628 | } | |
1629 | else { | |
1630 | const escape_char_parser<lex_escapes, char_t> lex_escape_ch = | |
1631 | escape_char_parser<lex_escapes, char_t>(); | |
1632 | ||
1633 | char_t c1; | |
1634 | ScannerT scan(first, last); | |
1635 | lex_escape_ch[assign(c1)].parse(scan); | |
1636 | if (*scan.first == '-') // insert a range | |
1637 | { | |
1638 | ++scan.first; | |
1639 | char_t c2; | |
1640 | lex_escape_ch[assign(c2)].parse(scan); | |
1641 | BOOST_ASSERT(c1 < c2); // Throw exception? | |
1642 | rr.set(utility::impl::range<char_t>(c1, c2)); | |
1643 | } | |
1644 | else // insert 1 char | |
1645 | { | |
1646 | rr.set(utility::impl::range<char_t>(c1, c1)); | |
1647 | } | |
1648 | } | |
1649 | } | |
1650 | ||
1651 | if (negated_ccl) | |
1652 | { | |
1653 | rr = ccl_utils::negate_range_run(rr); | |
1654 | } | |
1655 | ||
1656 | ccl_utils::create_nodes(rr, m_stack); | |
1657 | } | |
1658 | ||
1659 | std::stack<node*>& m_stack; | |
1660 | }; | |
1661 | ||
1662 | template <typename ScannerT> | |
1663 | class make_any_char | |
1664 | { | |
1665 | typedef typename ScannerT::iterator_t iterator_type; | |
1666 | ||
1667 | public: | |
1668 | typedef | |
1669 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1670 | char_t; | |
1671 | ||
1672 | std::stack<node*>& m_stack; | |
1673 | ||
1674 | make_any_char(std::stack<node*>& the_stack) | |
1675 | : m_stack(the_stack) | |
1676 | {} | |
1677 | ||
1678 | void operator()(const char_t c) const | |
1679 | { | |
1680 | BOOST_ASSERT(c == '.'); | |
1681 | do_any_char(); | |
1682 | } | |
1683 | ||
1684 | void do_any_char() const | |
1685 | { | |
1686 | static utility::impl::range_run<char_t> rr; | |
1687 | rr.set(full_range<char_t>()); | |
1688 | char_t newline = '\n'; | |
1689 | rr.clear(utility::impl::range<char_t>(newline, newline)); | |
1690 | ||
1691 | ccl_utils::create_nodes(rr, m_stack); | |
1692 | } | |
1693 | }; | |
1694 | ||
1695 | template <typename ScannerT> | |
1696 | class make_string | |
1697 | { | |
1698 | typedef typename ScannerT::iterator_t iterator_type; | |
1699 | ||
1700 | public: | |
1701 | typedef | |
1702 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1703 | char_t; | |
1704 | ||
1705 | std::stack<node*>& m_stack; | |
1706 | ||
1707 | make_string(std::stack<node*>& the_stack) | |
1708 | : m_stack(the_stack) | |
1709 | {} | |
1710 | ||
1711 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1712 | { | |
1713 | BOOST_ASSERT(*first == '"'); | |
1714 | ||
1715 | iterator_type first_ = first; | |
1716 | ScannerT scan(first_, last); | |
1717 | ++scan.first; // skip over '"' | |
1718 | ||
1719 | // empty string not allowed | |
1720 | if (*scan.first == '"') | |
1721 | { | |
1722 | boost::throw_exception(bad_regex()); | |
1723 | } | |
1724 | ||
1725 | const escape_char_parser<lex_escapes, char_t> lex_escape_ch = | |
1726 | escape_char_parser<lex_escapes, char_t>(); | |
1727 | ||
1728 | char_t c; | |
1729 | lex_escape_ch[assign(c)].parse(scan); | |
1730 | node* top_node = ccl_utils::create_mb_node_seq(c); | |
1731 | ||
1732 | while (*scan.first != '"' && scan.first != scan.last) | |
1733 | { | |
1734 | lex_escape_ch[assign(c)].parse(scan); | |
1735 | node* cur_node = ccl_utils::create_mb_node_seq(c); | |
1736 | top_node = new cat_node(top_node, cur_node); | |
1737 | } | |
1738 | m_stack.push(top_node); | |
1739 | } | |
1740 | }; | |
1741 | ||
1742 | inline | |
1743 | node* repeat_node(node* n, int num) | |
1744 | { | |
1745 | node* list_of_nodes = n; | |
1746 | for (int i = 1; i < num; ++i) | |
1747 | { | |
1748 | list_of_nodes = new cat_node(list_of_nodes, n->clone()); | |
1749 | } | |
1750 | return list_of_nodes; | |
1751 | } | |
1752 | ||
1753 | inline | |
1754 | node* optional_node(node* n) | |
1755 | { | |
1756 | return new or_node(n, new epsilon_node()); | |
1757 | } | |
1758 | ||
1759 | template <typename ScannerT> | |
1760 | class make_rep1 | |
1761 | { | |
1762 | typedef typename ScannerT::iterator_t iterator_type; | |
1763 | ||
1764 | public: | |
1765 | typedef | |
1766 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1767 | char_t; | |
1768 | ||
1769 | std::stack<node*>& m_stack; | |
1770 | ||
1771 | make_rep1(std::stack<node*>& the_stack) | |
1772 | : m_stack(the_stack) | |
1773 | {} | |
1774 | ||
1775 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1776 | { | |
1777 | BOOST_ASSERT(*first == '{'); | |
1778 | ||
1779 | iterator_type first_ = first; | |
1780 | ScannerT scan(first_, last); | |
1781 | ++scan.first; // skip over '{' | |
1782 | ||
1783 | unsigned int count; | |
1784 | uint_p[assign(count)].parse(scan); | |
1785 | if (count == 0) | |
1786 | boost::throw_exception(bad_regex()); | |
1787 | ||
1788 | node* top_node = m_stack.top(); | |
1789 | m_stack.pop(); | |
1790 | top_node = repeat_node(top_node, count); | |
1791 | m_stack.push(top_node); | |
1792 | } | |
1793 | }; | |
1794 | ||
1795 | template <typename ScannerT> | |
1796 | class make_rep2 | |
1797 | { | |
1798 | typedef typename ScannerT::iterator_t iterator_type; | |
1799 | ||
1800 | public: | |
1801 | typedef | |
1802 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1803 | char_t; | |
1804 | ||
1805 | std::stack<node*>& m_stack; | |
1806 | ||
1807 | make_rep2(std::stack<node*>& the_stack) | |
1808 | : m_stack(the_stack) | |
1809 | {} | |
1810 | ||
1811 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1812 | { | |
1813 | BOOST_ASSERT(*first == '{'); | |
1814 | ||
1815 | iterator_type first_ = first; | |
1816 | ScannerT scan (first_, last); | |
1817 | ++scan.first; // skip over '{' | |
1818 | ||
1819 | unsigned int count; | |
1820 | uint_p[assign(count)].parse(scan); | |
1821 | if (count == 0) | |
1822 | boost::throw_exception(bad_regex()); | |
1823 | ||
1824 | node* top_node = m_stack.top(); | |
1825 | m_stack.pop(); | |
1826 | top_node = new cat_node(repeat_node(top_node, count), | |
1827 | new star_node(top_node->clone())); | |
1828 | m_stack.push(top_node); | |
1829 | ||
1830 | } | |
1831 | }; | |
1832 | ||
1833 | template <typename ScannerT> | |
1834 | class make_rep3 | |
1835 | { | |
1836 | typedef typename ScannerT::iterator_t iterator_type; | |
1837 | ||
1838 | public: | |
1839 | typedef | |
1840 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1841 | char_t; | |
1842 | ||
1843 | std::stack<node*>& m_stack; | |
1844 | ||
1845 | make_rep3(std::stack<node*>& the_stack) | |
1846 | : m_stack(the_stack) | |
1847 | {} | |
1848 | ||
1849 | void operator()(iterator_type const& first, iterator_type const& last) const | |
1850 | { | |
1851 | BOOST_ASSERT(*first == '{'); | |
1852 | ||
1853 | iterator_type first_ = first; | |
1854 | ScannerT scan(first_, last); | |
1855 | ++scan.first; // skip over '{' | |
1856 | ||
1857 | unsigned int count1, count2; | |
1858 | uint_p[assign(count1)].parse(scan); | |
1859 | if (count1 == 0) | |
1860 | boost::throw_exception(bad_regex()); | |
1861 | ||
1862 | ++scan.first; // skip over ',' | |
1863 | ||
1864 | uint_p[assign(count2)].parse(scan); | |
1865 | if (count2 <= count1) | |
1866 | boost::throw_exception(bad_regex()); | |
1867 | ||
1868 | node* top_node = m_stack.top(); | |
1869 | m_stack.pop(); | |
1870 | node* repeats = repeat_node(top_node, count1); | |
1871 | top_node = new cat_node(repeats, | |
1872 | repeat_node(optional_node(top_node->clone()), | |
1873 | count2 - count1)); | |
1874 | ||
1875 | m_stack.push(top_node); | |
1876 | } | |
1877 | }; | |
1878 | ||
1879 | /////////////////////////////////////////////////////////////////////////////// | |
1880 | // | |
1881 | // Lexer grammar | |
1882 | // | |
1883 | // Defines the grammar, which mandates the syntax of the understood | |
1884 | // lexeme definitions passed to lexer::register_regex. | |
1885 | // | |
1886 | /////////////////////////////////////////////////////////////////////////////// | |
1887 | class lexer_grammar : public boost::spirit::classic::grammar<lexer_grammar> | |
1888 | { | |
1889 | public: | |
1890 | lexer_grammar(std::stack<node*> &node_stack_) | |
1891 | : node_stack(node_stack_) {} | |
1892 | ||
1893 | template <typename ScannerT> | |
1894 | struct definition | |
1895 | { | |
1896 | typedef rule<ScannerT> rule_t; | |
1897 | typedef typename ScannerT::iterator_t iterator_type; | |
1898 | typedef | |
1899 | typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type | |
1900 | char_t; | |
1901 | ||
1902 | rule_t regex, re, series, singleton, singleton2, fullccl, ccl, string, | |
1903 | escseq, ccl_char; | |
1904 | symbols<> ccl_expr; | |
1905 | ||
1906 | definition(lexer_grammar const &self) | |
1907 | { | |
1908 | regex = | |
1909 | re >> !('/' >> re) >> !ch_p('$') | |
1910 | ; | |
1911 | ||
1912 | re = | |
1913 | series | |
1914 | >>*( ('|' >> series)[make_or<ScannerT>(self.node_stack)] ) | |
1915 | ; | |
1916 | ||
1917 | series = | |
1918 | singleton | |
1919 | >>*( singleton[make_concat<ScannerT>(self.node_stack)] ) | |
1920 | ; | |
1921 | ||
1922 | singleton = | |
1923 | ch_p('.')[make_any_char<ScannerT>(self.node_stack)] | |
1924 | >> singleton2 | |
1925 | | fullccl | |
1926 | >> singleton2 | |
1927 | | ('"' >> string >> '"') | |
1928 | [ | |
1929 | make_string<ScannerT>(self.node_stack) | |
1930 | ] | |
1931 | >> singleton2 | |
1932 | | '(' >> re >> ')' | |
1933 | >> singleton2 | |
1934 | | ((anychar_p - chset<>("/|*+?.(){}\\")) | escseq) | |
1935 | [ | |
1936 | make_char<ScannerT>(self.node_stack) | |
1937 | ] | |
1938 | >> singleton2 | |
1939 | ; | |
1940 | ||
1941 | singleton2 = | |
1942 | ch_p('*')[make_star<ScannerT>(self.node_stack)] | |
1943 | >> singleton2 | |
1944 | | ch_p('+')[make_plus<ScannerT>(self.node_stack)] | |
1945 | >> singleton2 | |
1946 | | ch_p('?')[make_optional<ScannerT>(self.node_stack)] | |
1947 | >> singleton2 | |
1948 | | ('{' >> uint_p >> '}') | |
1949 | [ | |
1950 | make_rep1<ScannerT>(self.node_stack) | |
1951 | ] | |
1952 | >> singleton2 | |
1953 | | ('{' >> uint_p >> ',' >> '}') | |
1954 | [ | |
1955 | make_rep2<ScannerT>(self.node_stack) | |
1956 | ] | |
1957 | >> singleton2 | |
1958 | | ('{' >> uint_p >> ',' >> uint_p >> '}') | |
1959 | [ | |
1960 | make_rep3<ScannerT>(self.node_stack) | |
1961 | ] | |
1962 | >> singleton2 | |
1963 | | epsilon_p | |
1964 | ; | |
1965 | ||
1966 | fullccl = | |
1967 | ('[' >> !ch_p('^') >> ccl >> ']') | |
1968 | [ | |
1969 | make_ccl<ScannerT>(self.node_stack) | |
1970 | ] | |
1971 | ; | |
1972 | ||
1973 | ccl = | |
1974 | *(ccl_expr | (ccl_char >> !('-' >> ccl_char))) | |
1975 | ; | |
1976 | ||
1977 | ccl_char = | |
1978 | ( (anychar_p - chset<>("\\\n]")) | escseq ) | |
1979 | ; | |
1980 | ||
1981 | ccl_expr = | |
1982 | "[:alnum:]", | |
1983 | "[:alpha:]", | |
1984 | "[:blank:]", | |
1985 | "[:cntrl:]", | |
1986 | "[:digit:]", | |
1987 | "[:graph:]", | |
1988 | "[:lower:]", | |
1989 | "[:print:]", | |
1990 | "[:punct:]", | |
1991 | "[:space:]", | |
1992 | "[:upper:]", | |
1993 | "[:xdigit:]" | |
1994 | ; | |
1995 | ||
1996 | string = | |
1997 | +( (anychar_p - chset<>("\"\\")) | escseq ) | |
1998 | ; | |
1999 | ||
2000 | typedef | |
2001 | uint_parser<char_t, 8, 1, | |
2002 | std::numeric_limits<char_t>::digits / 3 + 1 | |
2003 | > oct_parser_t; | |
2004 | typedef | |
2005 | uint_parser<char_t, 16, 1, | |
2006 | std::numeric_limits<char_t>::digits / 4 + 1 | |
2007 | > hex_parser_t; | |
2008 | ||
2009 | escseq = | |
2010 | ch_p('\\') | |
2011 | >> ( | |
2012 | oct_parser_t() | |
2013 | | as_lower_d['x'] >> hex_parser_t() | |
2014 | | (anychar_p - chset<>('\n')) | |
2015 | ) | |
2016 | ; | |
2017 | ||
2018 | #define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2019 | ||
2020 | BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG); | |
2021 | BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG); | |
2022 | BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG); | |
2023 | BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG); | |
2024 | BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG); | |
2025 | BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG); | |
2026 | BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG); | |
2027 | BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG); | |
2028 | BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG); | |
2029 | BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG); | |
2030 | ||
2031 | #undef BOOST_SLEX_DEBUG | |
2032 | } | |
2033 | ||
2034 | rule<ScannerT> const& | |
2035 | start() const { return regex; } | |
2036 | }; | |
2037 | ||
2038 | std::stack<node*> &node_stack; | |
2039 | ||
2040 | }; // class lexer_grammar | |
2041 | ||
2042 | template <typename StringT> | |
2043 | inline node * | |
2044 | parse(lexer_grammar& g, StringT const& str) | |
2045 | { | |
2046 | typedef | |
2047 | scanner<typename StringT::const_iterator, scanner_policies<> > | |
2048 | scanner_t; | |
2049 | typedef typename rule<scanner_t>::template result<scanner_t>::type | |
2050 | result_t; | |
2051 | ||
2052 | typename StringT::const_iterator first = str.begin(); | |
2053 | typename StringT::const_iterator last = str.end(); | |
2054 | ||
2055 | scanner_t scan(first, last); | |
2056 | // typename rule<scanner_t>::result_t hit = g.parse(scan); | |
2057 | result_t hit = g.parse(scan); | |
2058 | if (!hit || !scan.at_end()) | |
2059 | { | |
2060 | while (g.node_stack.size()) | |
2061 | { | |
2062 | delete g.node_stack.top(); | |
2063 | g.node_stack.pop(); | |
2064 | } | |
2065 | return 0; | |
2066 | } | |
2067 | ||
2068 | BOOST_ASSERT(g.node_stack.size() == 1); | |
2069 | node* rval = g.node_stack.top(); | |
2070 | g.node_stack.pop(); | |
2071 | node* an_eof_node = new eof_node(); | |
2072 | rval = new cat_node(rval, an_eof_node); | |
2073 | return rval; | |
2074 | } | |
2075 | ||
2076 | inline | |
2077 | void make_case_insensitive(state_match_t& state_match) | |
2078 | { | |
2079 | // TODO: Fix this. | |
2080 | // This doesn't take into account foreign languages, figure out how to | |
2081 | // do that. Also this approach is broken for this implementation of | |
2082 | // wide chars. | |
2083 | for (state_match_t::iterator iter = state_match.begin(); | |
2084 | iter != state_match.end(); ++iter) | |
2085 | { | |
2086 | int i, j; | |
2087 | for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) | |
2088 | { | |
2089 | // if either is set, turn them both on | |
2090 | (*iter)[i] = (*iter)[j] = uchar((*iter)[i] | (*iter)[j]); | |
2091 | } | |
2092 | } | |
2093 | } | |
2094 | ||
2095 | template<bool wide_char> | |
2096 | struct regex_match_helper; | |
2097 | ||
2098 | template<> | |
2099 | struct regex_match_helper<false> // single byte char | |
2100 | { | |
2101 | template <typename DfaT, typename IteratorT> | |
2102 | static bool | |
2103 | do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, | |
2104 | int& regex_index, | |
2105 | std::basic_string< | |
2106 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2107 | > *token) | |
2108 | { | |
2109 | typedef std::basic_string< | |
2110 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2111 | > string_type; | |
2112 | typedef typename string_type::size_type size_type; | |
2113 | ||
2114 | node_id_t s = 0; | |
2115 | node_id_t last_accepting_index = invalid_node; | |
2116 | IteratorT p = first; | |
2117 | IteratorT last_accepting_cpos = first; | |
2118 | while (p != last) | |
2119 | { | |
2120 | s = dfa.transition_table[s][(uchar)*p]; | |
2121 | if (s == invalid_node) | |
2122 | break; | |
2123 | if (token) token->append((size_type)1, *p); | |
2124 | ++p; | |
2125 | if (dfa.acceptance_index[s] != invalid_node) | |
2126 | { | |
2127 | last_accepting_index = s; | |
2128 | last_accepting_cpos = p; | |
2129 | } | |
2130 | } | |
2131 | if (last_accepting_index != invalid_node) | |
2132 | { | |
2133 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2134 | std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << | |
2135 | dfa.acceptance_index[last_accepting_index] << '\n'; | |
2136 | #endif | |
2137 | ||
2138 | first = last_accepting_cpos; | |
2139 | regex_index = dfa.acceptance_index[last_accepting_index]; | |
2140 | return true; | |
2141 | } | |
2142 | else | |
2143 | return false; | |
2144 | } | |
2145 | }; | |
2146 | ||
2147 | template<> | |
2148 | struct regex_match_helper<true> // wide char | |
2149 | { | |
2150 | template <typename DfaT, typename IteratorT> | |
2151 | static bool | |
2152 | do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, | |
2153 | int& regex_index, | |
2154 | std::basic_string< | |
2155 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2156 | > *token) | |
2157 | { | |
2158 | typedef | |
2159 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2160 | char_t; | |
2161 | typedef std::basic_string<char_t> string_type; | |
2162 | typedef typename string_type::size_type size_type; | |
2163 | ||
2164 | node_id_t s = 0; | |
2165 | node_id_t last_accepting_index = invalid_node; | |
2166 | IteratorT wp = first; | |
2167 | IteratorT last_accepting_cpos = first; | |
2168 | ||
2169 | while (wp != last) | |
2170 | { | |
2171 | for (unsigned int i = 0; i < sizeof(char_t); ++i) | |
2172 | { | |
2173 | s = dfa.transition_table[s][get_byte(*wp,i)]; | |
2174 | if (s == invalid_node) | |
2175 | { | |
2176 | goto break_while; | |
2177 | } | |
2178 | } | |
2179 | if (token) token->append((size_type)1, *wp); | |
2180 | ++wp; | |
2181 | if (dfa.acceptance_index[s] != invalid_node) | |
2182 | { | |
2183 | last_accepting_index = s; | |
2184 | last_accepting_cpos = wp; | |
2185 | } | |
2186 | ||
2187 | } | |
2188 | ||
2189 | break_while: | |
2190 | if (last_accepting_index != invalid_node) | |
2191 | { | |
2192 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2193 | std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << | |
2194 | dfa.acceptance_index[last_accepting_index] << '\n'; | |
2195 | #endif | |
2196 | first = last_accepting_cpos; | |
2197 | regex_index = dfa.acceptance_index[last_accepting_index]; | |
2198 | ||
2199 | return true; | |
2200 | } | |
2201 | else | |
2202 | return false; | |
2203 | } | |
2204 | }; | |
2205 | ||
2206 | template <typename DfaT, typename IteratorT, bool wide_char> | |
2207 | struct regex_match | |
2208 | { | |
2209 | static bool | |
2210 | do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, | |
2211 | int& regex_index, | |
2212 | std::basic_string< | |
2213 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2214 | > *token) | |
2215 | { | |
2216 | return regex_match_helper<wide_char>::do_match( | |
2217 | dfa, first, last, regex_index, token); | |
2218 | } | |
2219 | }; | |
2220 | ||
2221 | } // namespace lexerimpl | |
2222 | ||
2223 | /////////////////////////////////////////////////////////////////////////////// | |
2224 | // | |
2225 | template <typename IteratorT = char const*, typename TokenT = int, | |
2226 | typename CallbackT = void(*)(IteratorT const &, | |
2227 | IteratorT &, | |
2228 | IteratorT const&, | |
2229 | TokenT const&, | |
2230 | lexer_control<TokenT> &)> | |
2231 | class lexer | |
2232 | { | |
2233 | public: | |
2234 | typedef CallbackT callback_t; | |
2235 | typedef | |
2236 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2237 | char_t; | |
2238 | ||
2239 | struct regex_info | |
2240 | { | |
2241 | std::basic_string<char_t> str; | |
2242 | TokenT token; | |
2243 | CallbackT callback; | |
2244 | ||
2245 | regex_info(const std::basic_string<char_t>& _str, | |
2246 | const TokenT& _token, | |
2247 | const CallbackT& _callback) | |
2248 | : str(_str) | |
2249 | , token(_token) | |
2250 | , callback(_callback) | |
2251 | {} | |
2252 | ||
2253 | }; | |
2254 | ||
2255 | struct dfa_table | |
2256 | { | |
2257 | std::vector<std::vector<node_id_t> > transition_table; | |
2258 | std::vector<node_id_t> acceptance_index; | |
2259 | }; | |
2260 | typedef std::vector<node_id_t> node_table_t; | |
2261 | typedef std::vector<node_table_t> transition_table_t; | |
2262 | typedef std::vector<dfa_table> dfa_t; | |
2263 | ||
2264 | ||
2265 | lexer(unsigned int states = 1); | |
2266 | ||
2267 | void register_regex(const std::basic_string<char_t>& regex, | |
2268 | const TokenT& id, const CallbackT& cb = CallbackT(), | |
2269 | unsigned int state = 0); | |
2270 | ||
2271 | TokenT next_token(IteratorT &first, IteratorT const &last, | |
2272 | std::basic_string<char_t> *token = 0); | |
2273 | ||
2274 | void create_dfa(); | |
2275 | bool has_compiled_dfa() { return m_compiled_dfa; } | |
2276 | ||
2277 | void set_case_insensitive(bool insensitive); | |
2278 | ||
2279 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2280 | void dump(std::ostream& out); | |
2281 | #endif | |
2282 | typedef std::vector<std::vector<regex_info> > regex_list_t; | |
2283 | ||
2284 | bool load (std::ifstream &in, long unique_id = 0); | |
2285 | bool save (std::ofstream &out, long unique_id = 0); | |
2286 | enum { | |
2287 | SLEX_SIGNATURE = 0x58454C53, // "SLEX" | |
2288 | SLEX_VERSION_100 = 0x0100, // persistance version | |
2289 | SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100, | |
2290 | SLEX_MINOR_VERSION_MASK = 0xFF | |
2291 | }; | |
2292 | ||
2293 | private: | |
2294 | ||
2295 | void create_dfa_for_state(int state); | |
2296 | ||
2297 | static bool regex_match(const dfa_t& dfa, IteratorT& first, | |
2298 | IteratorT& last, int& regex_index); | |
2299 | ||
2300 | mutable std::stack<lexerimpl::node*> node_stack; | |
2301 | lexerimpl::lexer_grammar g; | |
2302 | ||
2303 | mutable bool m_compiled_dfa; | |
2304 | mutable dfa_t m_dfa; | |
2305 | ||
2306 | regex_list_t m_regex_list; | |
2307 | bool m_case_insensitive; | |
2308 | ||
2309 | unsigned int m_state; | |
2310 | std::stack<unsigned int> m_state_stack; | |
2311 | unsigned int m_num_states; | |
2312 | }; | |
2313 | ||
2314 | ||
2315 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2316 | inline | |
2317 | lexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states) | |
2318 | : g(node_stack) | |
2319 | , m_compiled_dfa(false) | |
2320 | , m_regex_list(states) | |
2321 | , m_case_insensitive(false) | |
2322 | , m_state(0) | |
2323 | , m_state_stack() | |
2324 | , m_num_states(states) | |
2325 | { | |
2326 | BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer", | |
2327 | BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX); | |
2328 | } | |
2329 | ||
2330 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2331 | inline void | |
2332 | lexer<IteratorT, TokenT, CallbackT>::register_regex( | |
2333 | const std::basic_string<char_t>& regex, const TokenT& id, | |
2334 | const CallbackT& callback, unsigned int state) | |
2335 | { | |
2336 | if (state > m_num_states) { | |
2337 | m_regex_list.resize(state); | |
2338 | m_num_states = state; | |
2339 | } | |
2340 | m_regex_list[state].push_back(regex_info(regex, id, callback)); | |
2341 | } | |
2342 | ||
2343 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2344 | inline TokenT | |
2345 | lexer<IteratorT, TokenT, CallbackT>::next_token( | |
2346 | IteratorT &first, IteratorT const& last, | |
2347 | std::basic_string< | |
2348 | typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type | |
2349 | > *token) | |
2350 | { | |
2351 | if (!m_compiled_dfa) | |
2352 | { | |
2353 | create_dfa(); | |
2354 | } | |
2355 | ||
2356 | IteratorT saved = first; | |
2357 | int regex_index; | |
2358 | if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>:: | |
2359 | do_match(m_dfa[m_state], first, last, regex_index, token)) | |
2360 | return -1; // TODO: can't return -1, need to return some invalid token. | |
2361 | // how to figure this out? We can use traits I guess. | |
2362 | else | |
2363 | { | |
2364 | regex_info regex = m_regex_list[m_state][regex_index]; | |
2365 | TokenT rval = regex.token; | |
2366 | if (regex.callback) | |
2367 | { | |
2368 | // execute corresponding callback | |
2369 | lexer_control<TokenT> controller(rval, m_state, m_state_stack); | |
2370 | regex.callback(saved, first, last, regex.token, controller); | |
2371 | if (controller.ignore_current_token_set()) { | |
2372 | if (token) | |
2373 | token->erase(); | |
2374 | return next_token(first, last, token); | |
2375 | } | |
2376 | } | |
2377 | return rval; | |
2378 | } | |
2379 | } | |
2380 | ||
2381 | namespace lexerimpl | |
2382 | { | |
2383 | ||
2384 | inline | |
2385 | bool find_acceptance_state(const node_set& eof_node_ids, | |
2386 | const node_set& current_set, | |
2387 | node_id_t& acceptance_node_id) | |
2388 | { | |
2389 | for(node_set::const_iterator nsi = eof_node_ids.begin(); | |
2390 | nsi != eof_node_ids.end(); ++nsi) | |
2391 | { | |
2392 | node_id_t eof_node_id =*nsi; | |
2393 | if (current_set.end() != current_set.find(eof_node_id)) | |
2394 | { | |
2395 | // store the first one we come to as the | |
2396 | // matched pattern | |
2397 | acceptance_node_id = eof_node_id; | |
2398 | // don't bother searching for more | |
2399 | return true; | |
2400 | } | |
2401 | } | |
2402 | return false; | |
2403 | } | |
2404 | ||
2405 | template <typename RegexListT, typename GrammarT> | |
2406 | inline std::auto_ptr<node> | |
2407 | parse_regexes(const RegexListT& regex_list, GrammarT& g) | |
2408 | { | |
2409 | // parse the expressions into a tree | |
2410 | if (regex_list.begin() == regex_list.end()) | |
2411 | boost::throw_exception(bad_regex()); | |
2412 | ||
2413 | typename RegexListT::const_iterator ri = regex_list.begin(); | |
2414 | std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str)); | |
2415 | if (tree.get() == 0) | |
2416 | boost::throw_exception(bad_regex()); | |
2417 | ||
2418 | ++ri; | |
2419 | for (/**/; ri != regex_list.end(); ++ri) | |
2420 | { | |
2421 | std::auto_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str)); | |
2422 | if (next_tree.get() == 0) | |
2423 | boost::throw_exception(bad_regex()); | |
2424 | std::auto_ptr<node> newnode(new or_node(tree.release(), next_tree.release())); | |
2425 | tree = newnode; | |
2426 | } | |
2427 | return tree; | |
2428 | } | |
2429 | ||
2430 | } //namespace lexerimpl | |
2431 | ||
2432 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2433 | inline void | |
2434 | lexer<IteratorT, TokenT, CallbackT>::create_dfa() | |
2435 | { | |
2436 | m_dfa.resize(m_num_states); | |
2437 | for (unsigned int i = 0; i < m_num_states; ++i) | |
2438 | create_dfa_for_state(i); | |
2439 | } | |
2440 | ||
2441 | // Algorithm from Compilers: Principles, Techniques, and Tools p. 141 | |
2442 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2443 | inline void | |
2444 | lexer<IteratorT, TokenT, CallbackT>::create_dfa_for_state(int state) | |
2445 | { | |
2446 | using lexerimpl::node; | |
2447 | std::auto_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g); | |
2448 | node_id_t dummy = 0; | |
2449 | tree->assign_node_ids(dummy); | |
2450 | ||
2451 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2452 | tree->dump(std::cout); | |
2453 | #endif | |
2454 | ||
2455 | // compute followpos(root) | |
2456 | followpos_t followpos; | |
2457 | tree->compute_followpos(followpos); | |
2458 | ||
2459 | // the dfa states <-> nfa state groups | |
2460 | std::map<node_set, node_id_t> dstates1; | |
2461 | std::map<node_id_t, node_set> dstates2; | |
2462 | ||
2463 | // the dfa transitions | |
2464 | m_dfa[state].transition_table.push_back( | |
2465 | std::vector<node_id_t>(256, invalid_node)); | |
2466 | m_dfa[state].acceptance_index.push_back(invalid_node); | |
2467 | ||
2468 | // whether the dfa state has been processed yet | |
2469 | std::vector<node_id_t> marked; | |
2470 | ||
2471 | // used to give a unique id to each dfa state | |
2472 | node_id_t num_states = 0; | |
2473 | ||
2474 | // initially, the only unmarked state in Dstates is firstpos(root). | |
2475 | marked.push_back(0); | |
2476 | node_set fpr = tree->firstpos(); | |
2477 | dstates1[fpr] = 0; | |
2478 | dstates2[0] = fpr; | |
2479 | state_match_t state_match; | |
2480 | tree->compute_state_match(state_match); | |
2481 | ||
2482 | if (m_case_insensitive) | |
2483 | lexerimpl::make_case_insensitive(state_match); | |
2484 | ||
2485 | node_set eof_node_ids; | |
2486 | tree->get_eof_ids(eof_node_ids); | |
2487 | // translate the eof_node_ids into a 0-based index | |
2488 | std::map<node_id_t, node_id_t> eof_node_id_map; | |
2489 | unsigned int x = 0; | |
2490 | for (node_set::iterator node_id_it = eof_node_ids.begin(); | |
2491 | node_id_it != eof_node_ids.end(); | |
2492 | ++node_id_it) | |
2493 | { | |
2494 | eof_node_id_map[*node_id_it] = x++; | |
2495 | } | |
2496 | ||
2497 | // figure out if this is an acceptance state | |
2498 | node_id_t eof_node_id; | |
2499 | if (lexerimpl::find_acceptance_state(eof_node_ids, fpr, eof_node_id)) | |
2500 | { | |
2501 | m_dfa[state].acceptance_index[0] = eof_node_id_map[eof_node_id]; | |
2502 | } | |
2503 | ||
2504 | std::vector<node_id_t>::iterator i = std::find(marked.begin(), marked.end(), | |
2505 | node_id_t(0)); | |
2506 | while (marked.end() != i) | |
2507 | { | |
2508 | *i = 1; | |
2509 | node_id_t T = node_id_t(std::distance(marked.begin(), i)); | |
2510 | BOOST_ASSERT(T < dstates2.size()); | |
2511 | node_set Tstates = dstates2[T]; | |
2512 | for (node_id_t j = 0; j < 256; ++j) | |
2513 | { | |
2514 | node_set U; | |
2515 | for (node_set::iterator k = Tstates.begin(); | |
2516 | k != Tstates.end(); ++k) | |
2517 | { | |
2518 | node_id_t p =*k; | |
2519 | BOOST_ASSERT(p < state_match.size()); | |
2520 | BOOST_ASSERT(j < state_match[p].size()); | |
2521 | if (state_match[p][j]) | |
2522 | { | |
2523 | node_set fpp = followpos[p]; | |
2524 | U.insert(fpp.begin(), fpp.end()); | |
2525 | } | |
2526 | } | |
2527 | if (U.size() > 0) | |
2528 | { | |
2529 | std::map<node_set, node_id_t>::iterator l = dstates1.find(U); | |
2530 | node_id_t target_state; | |
2531 | if (l == dstates1.end()) // not in the states yet | |
2532 | { | |
2533 | ++num_states; | |
2534 | dstates1[U] = target_state = num_states; | |
2535 | dstates2[target_state] = U; | |
2536 | marked.push_back(0); | |
2537 | m_dfa[state].transition_table.push_back( | |
2538 | std::vector<node_id_t>(256, invalid_node)); | |
2539 | m_dfa[state].acceptance_index.push_back(invalid_node); | |
2540 | // figure out if this is an acceptance state | |
2541 | node_id_t eof_node_id; | |
2542 | if (lexerimpl::find_acceptance_state(eof_node_ids, U, eof_node_id)) | |
2543 | { | |
2544 | m_dfa[state].acceptance_index[target_state] = | |
2545 | eof_node_id_map[eof_node_id]; | |
2546 | } | |
2547 | } | |
2548 | else | |
2549 | { | |
2550 | target_state = dstates1[U]; | |
2551 | } | |
2552 | ||
2553 | BOOST_ASSERT(T < m_dfa[state].transition_table.size()); | |
2554 | BOOST_ASSERT(j < m_dfa[state].transition_table[T].size()); | |
2555 | m_dfa[state].transition_table[T][j] = target_state; | |
2556 | } | |
2557 | ||
2558 | } | |
2559 | ||
2560 | i = std::find(marked.begin(), marked.end(), node_id_t(0)); | |
2561 | } | |
2562 | m_compiled_dfa = true; | |
2563 | } | |
2564 | ||
2565 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2566 | inline void | |
2567 | lexer<IteratorT, TokenT, CallbackT>::set_case_insensitive(bool insensitive) | |
2568 | { | |
2569 | m_case_insensitive = insensitive; | |
2570 | } | |
2571 | ||
2572 | ||
2573 | #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) | |
2574 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2575 | inline void | |
2576 | lexer<IteratorT, TokenT, CallbackT>::dump(std::ostream& out) | |
2577 | { | |
2578 | for (unsigned x = 0; x < m_dfa.size(); ++x) | |
2579 | { | |
2580 | out << "\nm_dfa[" << x << "] has " << m_dfa[x].transition_table.size() << " states\n"; | |
2581 | for (node_id_t i = 0; i < m_dfa[x].transition_table.size(); ++i) | |
2582 | { | |
2583 | out << "state " << i << ":"; | |
2584 | for (node_id_t j = 0; j < m_dfa[x].transition_table[i].size(); ++j) | |
2585 | { | |
2586 | if (m_dfa[x].transition_table[i][j] != invalid_node) | |
2587 | out << j << "->" << m_dfa[x].transition_table[i][j] << " "; | |
2588 | } | |
2589 | out << "\n"; | |
2590 | } | |
2591 | out << "acceptance states: "; | |
2592 | for(unsigned int k = 0; k < m_dfa[x].acceptance_index.size(); ++k) | |
2593 | { | |
2594 | if (m_dfa[x].acceptance_index[k] != invalid_node) | |
2595 | out << '<' << k << ',' << m_dfa[x].acceptance_index[k] << "> "; | |
2596 | } | |
2597 | out << endl; | |
2598 | } | |
2599 | } | |
2600 | #endif | |
2601 | ||
2602 | /////////////////////////////////////////////////////////////////////////////// | |
2603 | // load the lexer tables | |
2604 | #define slex_in(strm, val) \ | |
2605 | strm.read((char*)&val, sizeof(val)); \ | |
2606 | if (std::ios::goodbit != strm.rdstate()) return false | |
2607 | ||
2608 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2609 | inline bool | |
2610 | lexer<IteratorT, TokenT, CallbackT>::load (std::ifstream &in, long unique_id) | |
2611 | { | |
2612 | // ensure correct signature and version | |
2613 | long signature = 0; | |
2614 | ||
2615 | slex_in (in, signature); | |
2616 | if (signature != SLEX_SIGNATURE) | |
2617 | return false; // not for us | |
2618 | ||
2619 | long version = 0; | |
2620 | ||
2621 | slex_in (in, version); | |
2622 | if ((version & ~SLEX_MINOR_VERSION_MASK) > SLEX_LAST_KNOWN_VERSION) | |
2623 | return false; // to new for us | |
2624 | ||
2625 | long uid = 0; | |
2626 | ||
2627 | slex_in (in, uid); | |
2628 | if (uid != unique_id) | |
2629 | return false; // not saved by us | |
2630 | ||
2631 | // load auxiliary members | |
2632 | int num_states = 0; | |
2633 | ||
2634 | slex_in (in, num_states); | |
2635 | ||
2636 | // load the dfa tables | |
2637 | dfa_t in_dfa; | |
2638 | std::size_t dfa_size = 0; | |
2639 | ||
2640 | slex_in (in, dfa_size); | |
2641 | in_dfa.resize(dfa_size); | |
2642 | for (std::size_t dfa = 0; dfa < dfa_size; ++dfa) | |
2643 | { | |
2644 | // load the transition tables | |
2645 | std::size_t tt_size = 0; | |
2646 | transition_table_t &tt_table = in_dfa[dfa].transition_table; | |
2647 | ||
2648 | slex_in (in, tt_size); | |
2649 | tt_table.resize(tt_size); | |
2650 | for (std::size_t tt = 0; tt < tt_size; ++tt) | |
2651 | { | |
2652 | std::size_t nt_size = 0; | |
2653 | node_table_t &nt_table = tt_table[tt]; | |
2654 | ||
2655 | slex_in (in, nt_size); | |
2656 | nt_table.resize(nt_size); | |
2657 | for (std::size_t nt = 0; nt < nt_size; ++nt) | |
2658 | { | |
2659 | slex_in (in, nt_table[nt]); | |
2660 | } | |
2661 | } | |
2662 | ||
2663 | // load the acceptance index table | |
2664 | std::size_t ai_size = 0; | |
2665 | node_table_t &ai_table = in_dfa[dfa].acceptance_index; | |
2666 | ||
2667 | slex_in (in, ai_size); | |
2668 | ai_table.resize(ai_size); | |
2669 | for (std::size_t ai = 0; ai < ai_size; ++ai) | |
2670 | { | |
2671 | slex_in (in, ai_table[ai]); | |
2672 | } | |
2673 | } | |
2674 | ||
2675 | m_dfa.swap(in_dfa); // success, swap in the read values | |
2676 | m_num_states = num_states; | |
2677 | ||
2678 | m_compiled_dfa = true; | |
2679 | return true; | |
2680 | } | |
2681 | ||
2682 | #undef slex_in | |
2683 | ||
2684 | /////////////////////////////////////////////////////////////////////////////// | |
2685 | // save the lexer tables | |
2686 | #define slex_out(strm, val) \ | |
2687 | strm.write((char*)&val, sizeof(val)); \ | |
2688 | if (std::ios::goodbit != strm.rdstate()) return false | |
2689 | ||
2690 | template <typename IteratorT, typename TokenT, typename CallbackT> | |
2691 | inline bool | |
2692 | lexer<IteratorT, TokenT, CallbackT>::save (std::ofstream &out, long unique_id) | |
2693 | { | |
2694 | // save signature and version information | |
2695 | long out_long = SLEX_SIGNATURE; | |
2696 | ||
2697 | slex_out(out, out_long); | |
2698 | out_long = SLEX_VERSION_100; | |
2699 | slex_out(out, out_long); | |
2700 | slex_out(out, unique_id); | |
2701 | ||
2702 | // save auxiliary members | |
2703 | slex_out(out, m_num_states); | |
2704 | ||
2705 | // save the dfa tables | |
2706 | typedef typename dfa_t::const_iterator dfa_iter_t; | |
2707 | typedef transition_table_t::const_iterator transition_table_iter_t; | |
2708 | typedef node_table_t::const_iterator node_table_iter_t; | |
2709 | ||
2710 | std::size_t out_size_t = m_dfa.size(); | |
2711 | slex_out(out, out_size_t); | |
2712 | ||
2713 | dfa_iter_t end = m_dfa.end(); | |
2714 | for (dfa_iter_t it = m_dfa.begin(); it != end; ++it) | |
2715 | { | |
2716 | // save the transition table | |
2717 | out_size_t = (*it).transition_table.size(); | |
2718 | slex_out(out, out_size_t); | |
2719 | ||
2720 | transition_table_iter_t tt_end = (*it).transition_table.end(); | |
2721 | for (transition_table_iter_t tt_it = (*it).transition_table.begin(); | |
2722 | tt_it != tt_end; | |
2723 | ++tt_it) | |
2724 | { | |
2725 | out_size_t = (*tt_it).size(); | |
2726 | slex_out(out, out_size_t); | |
2727 | ||
2728 | node_table_iter_t nt_end = (*tt_it).end(); | |
2729 | for (node_table_iter_t nt_it = (*tt_it).begin(); | |
2730 | nt_it != nt_end; | |
2731 | ++nt_it) | |
2732 | { | |
2733 | slex_out(out, (*nt_it)); | |
2734 | } | |
2735 | } | |
2736 | ||
2737 | // save the acceptance index table | |
2738 | out_size_t = (*it).acceptance_index.size(); | |
2739 | slex_out(out, out_size_t); | |
2740 | ||
2741 | node_table_iter_t nt_end = (*it).acceptance_index.end(); | |
2742 | for (node_table_iter_t nt_it = (*it).acceptance_index.begin(); | |
2743 | nt_it != nt_end; | |
2744 | ++nt_it) | |
2745 | { | |
2746 | slex_out(out, (*nt_it)); | |
2747 | } | |
2748 | } | |
2749 | return true; | |
2750 | } | |
2751 | ||
2752 | #undef slex_out | |
2753 | ||
2754 | /* | |
2755 | a lexer_control object supports some operations on the lexer. | |
2756 | get current lexer state | |
2757 | set state | |
2758 | terminate | |
2759 | state stack (push, pop, top) | |
2760 | set new token return value | |
2761 | ignore the current token | |
2762 | yymore | |
2763 | get length of matched token | |
2764 | */ | |
2765 | template <typename TokenT> | |
2766 | class lexer_control | |
2767 | { | |
2768 | public: | |
2769 | ||
2770 | lexer_control(TokenT& token, unsigned int& current_state, | |
2771 | std::stack<unsigned int>& state_stack); | |
2772 | // functions dealing with the lexer state | |
2773 | ||
2774 | // set the state to state | |
2775 | void set_state(unsigned int state); | |
2776 | ||
2777 | // get the current state | |
2778 | unsigned int get_state(); | |
2779 | ||
2780 | // pushes the current state onto the top of the state stack and | |
2781 | // switches to new_state | |
2782 | void push_state(unsigned int new_state); | |
2783 | ||
2784 | // pops the top of the state stack and switches to it. | |
2785 | void pop_state(); | |
2786 | ||
2787 | // returns the top of the stack without altering the stack's contents. | |
2788 | unsigned int top_state(); | |
2789 | ||
2790 | // functions dealing with the token returned. | |
2791 | ||
2792 | // set a new TokenT return value, overriding that one that was | |
2793 | // registered via. register_regex() | |
2794 | void set_token(const TokenT& token); | |
2795 | ||
2796 | // tell the lexer to return an invalid token, signifying termination. | |
2797 | void terminate(); | |
2798 | ||
2799 | // ignore the current token, and move on to the next one. The current | |
2800 | // token will NOT be returned from next_token() | |
2801 | void ignore_current_token(); | |
2802 | ||
2803 | // returns true if ignore_current_token() has been called, | |
2804 | // false otherwise. | |
2805 | bool ignore_current_token_set(); | |
2806 | ||
2807 | private: | |
2808 | TokenT& m_token; | |
2809 | bool m_ignore_current_token; | |
2810 | unsigned int& m_current_state; | |
2811 | std::stack<unsigned int>& m_state_stack; | |
2812 | }; | |
2813 | ||
2814 | template <typename TokenT> | |
2815 | inline | |
2816 | lexer_control<TokenT>::lexer_control(TokenT& token, unsigned int& current_state, | |
2817 | std::stack<unsigned int>& state_stack) | |
2818 | : m_token(token) | |
2819 | , m_ignore_current_token(false) | |
2820 | , m_current_state(current_state) | |
2821 | , m_state_stack(state_stack) | |
2822 | { | |
2823 | } | |
2824 | ||
2825 | template <typename TokenT> | |
2826 | inline void | |
2827 | lexer_control<TokenT>::set_state(unsigned int state) | |
2828 | { | |
2829 | m_current_state = state; | |
2830 | } | |
2831 | ||
2832 | template <typename TokenT> | |
2833 | inline unsigned int | |
2834 | lexer_control<TokenT>::get_state() | |
2835 | { | |
2836 | return m_current_state; | |
2837 | } | |
2838 | ||
2839 | template <typename TokenT> | |
2840 | inline void | |
2841 | lexer_control<TokenT>::push_state(unsigned int new_state) | |
2842 | { | |
2843 | m_state_stack.push(m_current_state); | |
2844 | m_current_state = new_state; | |
2845 | } | |
2846 | ||
2847 | template <typename TokenT> | |
2848 | inline void | |
2849 | lexer_control<TokenT>::pop_state() | |
2850 | { | |
2851 | m_current_state = m_state_stack.top(); | |
2852 | m_state_stack.pop(); | |
2853 | } | |
2854 | ||
2855 | template <typename TokenT> | |
2856 | inline unsigned int | |
2857 | lexer_control<TokenT>::top_state() | |
2858 | { | |
2859 | return m_state_stack.top(); | |
2860 | } | |
2861 | ||
2862 | template <typename TokenT> | |
2863 | inline void | |
2864 | lexer_control<TokenT>::set_token(const TokenT& token) | |
2865 | { | |
2866 | m_token = token; | |
2867 | } | |
2868 | ||
2869 | template <typename TokenT> | |
2870 | inline void | |
2871 | lexer_control<TokenT>::terminate() | |
2872 | { | |
2873 | m_token = -1; // TOOD: fix this, need to be determined by traits | |
2874 | } | |
2875 | ||
2876 | template <typename TokenT> | |
2877 | inline void | |
2878 | lexer_control<TokenT>::ignore_current_token() | |
2879 | { | |
2880 | m_ignore_current_token = true; | |
2881 | } | |
2882 | ||
2883 | template <typename TokenT> | |
2884 | inline bool | |
2885 | lexer_control<TokenT>::ignore_current_token_set() | |
2886 | { | |
2887 | return m_ignore_current_token; | |
2888 | } | |
2889 | ||
2890 | } // namespace classic | |
2891 | } // namespace spirit | |
2892 | } // namespace boost | |
2893 | ||
2894 | #undef BOOST_SPIRIT_IT_NS | |
2895 | ||
2896 | #endif | |
2897 |