1 /*=============================================================================
2 Boost.Wave: A Standard compliant C++ preprocessor library
8 Copyright (c) 2001, Daniel C. Nuffer.
9 Copyright (c) 2001-2012 Hartmut Kaiser.
10 Distributed under the Boost Software License, Version 1.0. (See accompanying
11 file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
14 X callback objects (called when a match is made.)
15 X callback passed first & last iterator, and
16 a reference to a lexercontrol object that supports some
17 operations on the lexer.
20 state stack (push, pop, top)
21 set new token return value
22 ignore the current token
24 get length of matched token
25 get current lexer state
26 X DFA serialization to save recomputing the DFA.
29 organize the file into hpp and ipp. arrange the classes in a logical order.
30 use buffering - iterator only needs be an input iterator,
31 lexer & callback are not templatized on iterator type, but char type.
32 next_token is templatized on iterator type.
33 beginning/ending contexts.
36 DFA table compression.
38 =============================================================================*/
39 #ifndef BOOST_SPIRIT_LEXER_HPP
40 #define BOOST_SPIRIT_LEXER_HPP
42 ///////////////////////////////////////////////////////////////////////////////
43 #include <boost/config.hpp>
44 #include <boost/throw_exception.hpp>
46 #include <boost/spirit/include/classic_core.hpp>
47 #include <boost/spirit/include/classic_symbols.hpp>
48 #include <boost/spirit/include/classic_chset.hpp>
49 #include <boost/spirit/include/classic_escape_char.hpp>
53 #include <memory> // for auto_ptr/unique_ptr
56 #include <utility> // for pair
57 #if defined(BOOST_SPIRIT_DEBUG)
61 #include <boost/assert.hpp>
62 #include <boost/limits.hpp>
64 #if defined(BOOST_NO_STD_ITERATOR_TRAITS)
65 #define BOOST_SPIRIT_IT_NS impl
67 #define BOOST_SPIRIT_IT_NS std
70 ///////////////////////////////////////////////////////////////////////////////
75 typedef unsigned char uchar;
76 typedef unsigned int node_id_t;
77 const node_id_t invalid_node = node_id_t(-1);
78 typedef std::set<node_id_t> node_set;
79 typedef std::vector<uchar> uchar_vector;
80 typedef std::map<node_id_t, node_set> followpos_t;
81 typedef std::vector<uchar_vector> state_match_t;
83 template <typename TokenT>
86 class bad_regex : public std::exception
99 virtual node* clone() const = 0;
100 virtual bool nullable() const = 0;
101 virtual node_set firstpos() const = 0;
102 virtual node_set lastpos() const = 0;
103 virtual void compute_followpos(followpos_t& followpos) const = 0;
104 virtual void compute_state_match(state_match_t& state_match) const = 0;
105 virtual void get_eof_ids(node_set& eof_set) const = 0;
106 virtual void assign_node_ids(node_id_t& node_count) = 0;
107 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
108 virtual void dump(std::ostream& out) const = 0;
113 class char_node : public node
118 char_node(const uchar c);
119 char_node(const char_node& x);
120 virtual ~char_node(){}
122 node* clone() const BOOST_OVERRIDE;
123 bool nullable() const BOOST_OVERRIDE;
124 node_set firstpos() const BOOST_OVERRIDE;
125 node_set lastpos() const BOOST_OVERRIDE;
126 void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE;
127 void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE;
128 void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE;
129 void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE;
130 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
131 void dump(std::ostream& out) const BOOST_OVERRIDE;
137 node_id_t m_node_num;
141 char_node::char_node(const uchar c)
149 char_node::char_node(const char_node& x)
152 , m_node_num(x.m_node_num)
157 char_node::clone() const
159 return new char_node(*this);
163 char_node::nullable() const
169 char_node::firstpos() const
172 rval.insert(m_node_num);
177 char_node::lastpos() const
183 char_node::compute_followpos(followpos_t&) const
189 char_node::compute_state_match(state_match_t& state_match) const
191 if (state_match.size() < m_node_num + 1)
192 state_match.resize(m_node_num + 1);
193 state_match[m_node_num].resize(256);
194 state_match[m_node_num][m_char] = 1;
198 char_node::get_eof_ids(node_set&) const
204 char_node::assign_node_ids(node_id_t& node_count)
206 m_node_num = node_count++;
209 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
211 char_node::dump(std::ostream& out) const
213 out << "\nchar_node m_char = " << m_char;
214 out << " m_node_num = " << m_node_num;
215 out << " nullable() = " << (nullable() ? "true" : "false");
216 out << " firstpos() = ";
217 node_set fp = firstpos();
218 std::copy(fp.begin(), fp.end(),
219 std::ostream_iterator<node_id_t>(out, ","));
221 out << " lastpos() = ";
222 node_set lp = lastpos();
223 std::copy(lp.begin(), lp.end(),
224 std::ostream_iterator<node_id_t>(out, ","));
229 class epsilon_node : public node
235 epsilon_node(const epsilon_node& x);
236 virtual ~epsilon_node(){}
238 node* clone() const BOOST_OVERRIDE;
239 bool nullable() const BOOST_OVERRIDE;
240 node_set firstpos() const BOOST_OVERRIDE;
241 node_set lastpos() const BOOST_OVERRIDE;
242 void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE;
243 void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE;
244 void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE;
245 void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE;
246 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
247 void dump(std::ostream& out) const BOOST_OVERRIDE;
252 node_id_t m_node_num;
256 epsilon_node::epsilon_node()
263 epsilon_node::epsilon_node(const epsilon_node& x)
265 , m_node_num(x.m_node_num)
270 epsilon_node::clone() const
272 return new epsilon_node(*this);
276 epsilon_node::nullable() const
282 epsilon_node::firstpos() const
288 epsilon_node::lastpos() const
294 epsilon_node::compute_followpos(followpos_t&) const
300 epsilon_node::compute_state_match(state_match_t& state_match) const
302 if (state_match.size() < m_node_num + 1)
303 state_match.resize(m_node_num + 1);
304 state_match[m_node_num].resize(256, 1);
308 epsilon_node::get_eof_ids(node_set&) const
314 epsilon_node::assign_node_ids(node_id_t& node_count)
316 m_node_num = node_count++;
319 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
321 epsilon_node::dump(std::ostream& out) const
323 out << "\nepsilon_node";
324 out << " m_node_num = " << m_node_num;
325 out << " nullable() = " << (nullable() ? "true" : "false");
326 out << " firstpos() = ";
327 node_set fp = firstpos();
328 std::copy(fp.begin(), fp.end(),
329 std::ostream_iterator<node_id_t>(out, ","));
331 out << " lastpos() = ";
332 node_set lp = lastpos();
333 std::copy(lp.begin(), lp.end(),
334 std::ostream_iterator<node_id_t>(out, ","));
339 class or_node : public node
344 or_node(node* left, node* right);
345 or_node(const or_node& x);
348 node* clone() const BOOST_OVERRIDE;
349 bool nullable() const BOOST_OVERRIDE;
350 node_set firstpos() const BOOST_OVERRIDE;
351 node_set lastpos() const BOOST_OVERRIDE;
352 void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE;
353 void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE;
354 void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE;
355 void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE;
356 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
357 void dump(std::ostream& out) const BOOST_OVERRIDE;
362 #ifndef BOOST_NO_CXX11_SMART_PTR
363 std::unique_ptr<node> m_left;
364 std::unique_ptr<node> m_right;
366 std::auto_ptr<node> m_left;
367 std::auto_ptr<node> m_right;
372 or_node::or_node(node* left, node* right)
380 or_node::or_node(const or_node& x)
382 , m_left(x.m_left->clone())
383 , m_right(x.m_right->clone())
388 or_node::clone() const
390 return new or_node(m_left->clone(), m_right->clone());
394 or_node::nullable() const
396 return m_left->nullable() || m_right->nullable();
400 or_node::firstpos() const
403 node_set l = m_left->firstpos();
404 node_set r = m_right->firstpos();
405 std::set_union(l.begin(), l.end(), r.begin(), r.end(),
406 std::inserter(rval, rval.begin()));
411 or_node::lastpos() const
414 node_set l = m_left->lastpos();
415 node_set r = m_right->lastpos();
416 std::set_union(l.begin(), l.end(), r.begin(), r.end(),
417 std::inserter(rval, rval.begin()));
422 or_node::compute_followpos(followpos_t& followpos) const
424 m_left->compute_followpos(followpos);
425 m_right->compute_followpos(followpos);
429 or_node::compute_state_match(state_match_t& state_match) const
431 m_left->compute_state_match(state_match);
432 m_right->compute_state_match(state_match);
436 or_node::get_eof_ids(node_set& eof_nodes) const
438 m_left->get_eof_ids(eof_nodes);
439 m_right->get_eof_ids(eof_nodes);
443 or_node::assign_node_ids(node_id_t& node_count)
445 m_left->assign_node_ids(node_count);
446 m_right->assign_node_ids(node_count);
449 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
451 or_node::dump(std::ostream& out) const
456 out << " nullable() = " << (nullable() ? "true" : "false");
457 out << " firstpos() = ";
458 node_set fp = firstpos();
459 std::copy(fp.begin(), fp.end(),
460 std::ostream_iterator<node_id_t>(out, ","));
462 out << " lastpos() = ";
463 node_set lp = lastpos();
464 std::copy(lp.begin(), lp.end(),
465 std::ostream_iterator<node_id_t>(out, ","));
472 class cat_node : public node
477 cat_node(node* left, node* right);
478 cat_node(const cat_node& x);
479 virtual ~cat_node(){}
481 node* clone() const BOOST_OVERRIDE;
482 bool nullable() const BOOST_OVERRIDE;
483 node_set firstpos() const BOOST_OVERRIDE;
484 node_set lastpos() const BOOST_OVERRIDE;
485 void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE;
486 void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE;
487 void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE;
488 void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE;
489 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
490 void dump(std::ostream& out) const BOOST_OVERRIDE;
495 #ifndef BOOST_NO_CXX11_SMART_PTR
496 std::unique_ptr<node> m_left;
497 std::unique_ptr<node> m_right;
499 std::auto_ptr<node> m_left;
500 std::auto_ptr<node> m_right;
505 cat_node::cat_node(node* left, node* right)
513 cat_node::cat_node(const cat_node& x)
515 , m_left(x.m_left->clone())
516 , m_right(x.m_right->clone())
521 cat_node::clone() const
523 return new cat_node(m_left->clone(), m_right->clone());
527 cat_node::nullable() const
529 return m_left->nullable() && m_right->nullable();
533 cat_node::firstpos() const
535 if (m_left->nullable())
538 node_set l = m_left->firstpos();
539 node_set r = m_right->firstpos();
540 std::set_union(l.begin(), l.end(), r.begin(), r.end(),
541 std::inserter(rval, rval.begin()));
546 return m_left->firstpos();
551 cat_node::lastpos() const
553 if (m_right->nullable())
556 node_set l = m_left->lastpos();
557 node_set r = m_right->lastpos();
558 std::set_union(l.begin(), l.end(), r.begin(), r.end(),
559 std::inserter(rval, rval.begin()));
564 return m_right->lastpos();
569 cat_node::compute_followpos(followpos_t& followpos) const
571 node_set l = m_left->lastpos();
572 for (node_set::iterator i = l.begin();
576 node_set rf = m_right->firstpos();
577 followpos[*i].insert(rf.begin(), rf.end());
580 m_left->compute_followpos(followpos);
581 m_right->compute_followpos(followpos);
585 cat_node::compute_state_match(state_match_t& state_match) const
587 m_left->compute_state_match(state_match);
588 m_right->compute_state_match(state_match);
592 cat_node::get_eof_ids(node_set& eof_nodes) const
594 m_left->get_eof_ids(eof_nodes);
595 m_right->get_eof_ids(eof_nodes);
599 cat_node::assign_node_ids(node_id_t& node_count)
601 m_left->assign_node_ids(node_count);
602 m_right->assign_node_ids(node_count);
605 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
607 cat_node::dump(std::ostream& out) const
612 out << " nullable() = " << (nullable() ? "true" : "false");
613 out << " firstpos() = ";
614 node_set fp = firstpos();
615 std::copy(fp.begin(), fp.end(),
616 std::ostream_iterator<node_id_t>(out, ","));
618 out << " lastpos() = ";
619 node_set lp = lastpos();
620 std::copy(lp.begin(), lp.end(),
621 std::ostream_iterator<node_id_t>(out, ","));
628 class star_node : public node
633 star_node(node* left);
634 star_node(const star_node& x);
635 virtual ~star_node(){}
637 node* clone() const BOOST_OVERRIDE;
638 bool nullable() const BOOST_OVERRIDE;
639 node_set firstpos() const BOOST_OVERRIDE;
640 node_set lastpos() const BOOST_OVERRIDE;
641 void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE;
642 void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE;
643 void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE;
644 void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE;
645 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
646 void dump(std::ostream& out) const BOOST_OVERRIDE;
651 #ifndef BOOST_NO_CXX11_SMART_PTR
652 std::unique_ptr<node> m_left;
654 std::auto_ptr<node> m_left;
659 star_node::star_node(node* left)
666 star_node::star_node(const star_node& x)
668 , m_left(x.m_left->clone())
673 star_node::clone() const
675 return new star_node(m_left->clone());
679 star_node::nullable() const
685 star_node::firstpos() const
687 return m_left->firstpos();
691 star_node::lastpos() const
693 return m_left->lastpos();
697 star_node::compute_followpos(followpos_t& followpos) const
699 node_set lp = this->lastpos();
700 for (node_set::iterator i = lp.begin();
704 node_set fp = this->firstpos();
705 followpos[*i].insert(fp.begin(), fp.end());
708 m_left->compute_followpos(followpos);
712 star_node::compute_state_match(state_match_t& state_match) const
714 m_left->compute_state_match(state_match);
718 star_node::get_eof_ids(node_set& eof_nodes) const
720 m_left->get_eof_ids(eof_nodes);
724 star_node::assign_node_ids(node_id_t& node_count)
726 m_left->assign_node_ids(node_count);
729 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
731 star_node::dump(std::ostream& out) const
735 out << "\nstar_node";
736 out << " nullable() = " << (nullable() ? "true" : "false");
737 out << " firstpos() = ";
738 node_set fp = firstpos();
739 std::copy(fp.begin(), fp.end(),
740 std::ostream_iterator<node_id_t>(out, ","));
742 out << " lastpos() = ";
743 node_set lp = lastpos();
744 std::copy(lp.begin(), lp.end(),
745 std::ostream_iterator<node_id_t>(out, ","));
751 class eof_node : public node
757 eof_node(const eof_node& x);
758 virtual ~eof_node(){}
760 node* clone() const BOOST_OVERRIDE;
761 bool nullable() const BOOST_OVERRIDE;
762 node_set firstpos() const BOOST_OVERRIDE;
763 node_set lastpos() const BOOST_OVERRIDE;
764 void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE;
765 void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE;
766 void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE;
767 void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE;
768 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
769 void dump(std::ostream& out) const BOOST_OVERRIDE;
774 node_id_t m_node_num;
785 eof_node::eof_node(const eof_node& x)
787 , m_node_num(x.m_node_num)
792 eof_node::clone() const
794 return new eof_node(*this);
798 eof_node::nullable() const
804 eof_node::firstpos() const
807 rval.insert(m_node_num);
812 eof_node::lastpos() const
815 rval.insert(m_node_num);
820 eof_node::compute_followpos(followpos_t&) const
826 eof_node::compute_state_match(state_match_t& state_match) const
828 if (state_match.size() < m_node_num + 1)
829 state_match.resize(m_node_num + 1);
830 state_match[m_node_num].resize(256, 0);
834 eof_node::get_eof_ids(node_set& eof_nodes) const
836 eof_nodes.insert(m_node_num);
840 eof_node::assign_node_ids(node_id_t& node_count)
842 m_node_num = node_count++;
845 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
847 eof_node::dump(std::ostream& out) const
850 out << " m_node_num = " << m_node_num;
851 out << " nullable() = " << (nullable() ? "true" : "false");
852 out << " firstpos() = ";
853 node_set fp = firstpos();
854 std::copy(fp.begin(), fp.end(),
855 std::ostream_iterator<node_id_t>(out, ","));
857 out << " lastpos() = ";
858 node_set lp = lastpos();
859 std::copy(lp.begin(), lp.end(),
860 std::ostream_iterator<node_id_t>(out, ","));
864 class ccl_node : public node
869 ccl_node(const std::vector<uchar>& v);
870 ccl_node(const uchar c1, const uchar c2);
871 ccl_node(const ccl_node& x);
872 virtual ~ccl_node(){}
874 node* clone() const BOOST_OVERRIDE;
875 bool nullable() const BOOST_OVERRIDE;
876 node_set firstpos() const BOOST_OVERRIDE;
877 node_set lastpos() const BOOST_OVERRIDE;
878 void compute_followpos(followpos_t& followpos) const BOOST_OVERRIDE;
879 void compute_state_match(state_match_t& state_match ) const BOOST_OVERRIDE;
880 void get_eof_ids(node_set& eof_set) const BOOST_OVERRIDE;
881 void assign_node_ids(node_id_t& node_count) BOOST_OVERRIDE;
882 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
883 void dump(std::ostream& out) const BOOST_OVERRIDE;
888 std::vector<uchar> m_match;
889 node_id_t m_node_num;
893 ccl_node::ccl_node(const std::vector<uchar>& v)
898 m_match.resize(256); // make sure it's the right size
902 ccl_node::ccl_node(const uchar c1, const uchar c2)
904 , m_match(256, uchar(0))
907 BOOST_ASSERT(c1 < c2);
908 for (std::size_t i = c1; i <= std::size_t(c2); ++i)
915 ccl_node::ccl_node(const ccl_node& x)
918 , m_node_num(x.m_node_num)
923 ccl_node::clone() const
925 return new ccl_node(*this);
929 ccl_node::nullable() const
935 ccl_node::firstpos() const
938 rval.insert(m_node_num);
943 ccl_node::lastpos() const
949 ccl_node::compute_followpos(followpos_t&) const
955 ccl_node::compute_state_match(state_match_t& state_match) const
957 if (state_match.size() < m_node_num + 1)
958 state_match.resize(m_node_num + 1);
959 state_match[m_node_num] = m_match;
963 ccl_node::get_eof_ids(node_set&) const
969 ccl_node::assign_node_ids(node_id_t& node_count)
971 m_node_num = node_count++;
974 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
976 ccl_node::dump(std::ostream& out) const
978 out << "\nccl_node m_match = ";
979 for (std::size_t i = 0; i < m_match.size(); ++i)
984 out << " m_node_num = " << m_node_num;
985 out << " nullable() = " << (nullable() ? "true" : "false");
986 out << " firstpos() = ";
987 node_set fp = firstpos();
988 std::copy(fp.begin(), fp.end(),
989 std::ostream_iterator<node_id_t>(out, ","));
991 out << " lastpos() = ";
992 node_set lp = lastpos();
993 std::copy(lp.begin(), lp.end(),
994 std::ostream_iterator<node_id_t>(out, ","));
998 template <typename ScannerT>
1001 typedef typename ScannerT::iterator_t iterator_type;
1005 make_concat(std::stack<node*>& the_stack)
1006 : m_stack(the_stack)
1009 void operator()(iterator_type const &, iterator_type const &) const
1011 node* right = m_stack.top();
1013 node* left = m_stack.top();
1015 node* newnode = new cat_node(left, right);
1016 m_stack.push(newnode);
1019 std::stack<node*>& m_stack;
1022 template <int CharTSize>
1023 struct get_byte_aux;
1026 struct get_byte_aux<1>
1028 template <typename CharT>
1029 unsigned char operator()(CharT c, unsigned int byte)
1031 BOOST_ASSERT(byte == 0);
1037 struct get_byte_aux<2>
1039 template <typename CharT>
1040 unsigned char operator()(CharT c, unsigned int byte)
1042 static unsigned long mask[] =
1048 BOOST_ASSERT(byte < 2);
1049 return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8);
1054 struct get_byte_aux<4>
1056 template <typename CharT>
1057 unsigned char operator()(CharT c, unsigned int byte)
1059 static unsigned long mask[] =
1067 BOOST_ASSERT(byte < 4);
1068 return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8);
1072 template <typename CharT>
1073 inline unsigned char
1074 get_byte(CharT c, unsigned int byte)
1076 return get_byte_aux<sizeof(c)>()(c, byte);
1079 template <typename ScannerT>
1082 typedef typename ScannerT::iterator_t iterator_type;
1086 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1089 make_star(std::stack<node*>& the_stack)
1090 : m_stack(the_stack)
1093 void operator()(const char_t) const
1095 node* left = m_stack.top();
1097 node* newnode = new star_node(left);
1098 m_stack.push(newnode);
1101 std::stack<node*>& m_stack;
1104 template <typename ScannerT>
1107 typedef typename ScannerT::iterator_t iterator_type;
1111 make_or(std::stack<node*>& the_stack)
1112 : m_stack(the_stack)
1115 void operator()(iterator_type const&, iterator_type const&) const
1117 node* right = m_stack.top();
1119 node* left = m_stack.top();
1121 node* newnode = new or_node(left, right);
1122 m_stack.push(newnode);
1125 std::stack<node*>& m_stack;
1128 template <typename ScannerT>
1131 typedef typename ScannerT::iterator_t iterator_type;
1135 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1138 make_plus(std::stack<node*>& the_stack)
1139 : m_stack(the_stack)
1142 void operator()(const char_t) const
1144 node* left = m_stack.top();
1147 node* copy = left->clone();
1149 node* new_star = new star_node(copy);
1150 node* new_cat = new cat_node(left, new_star);
1151 m_stack.push(new_cat);
1154 std::stack<node*>& m_stack;
1157 template <typename ScannerT>
1160 typedef typename ScannerT::iterator_t iterator_type;
1164 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1167 make_optional(std::stack<node*>& the_stack)
1168 : m_stack(the_stack)
1171 void operator()(const char_t) const
1173 node* left = m_stack.top();
1176 node* new_or = new or_node(left, new epsilon_node());
1177 m_stack.push(new_or);
1180 std::stack<node*>& m_stack;
1183 ///////////////////////////////////////////////////////////////////////////////
1185 template <typename CharT>
1186 inline utility::impl::range<CharT> const&
1189 static utility::impl::range<CharT> full((std::numeric_limits<CharT>::min)(),
1190 (std::numeric_limits<CharT>::max)());
1196 template <typename char_t>
1197 inline utility::impl::range_run<char_t>
1199 const utility::impl::range_run<char_t>& rr)
1201 utility::impl::range_run<char_t> newrr;
1202 newrr.set(full_range<char_t>());
1203 for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin();
1204 iter != rr.end(); ++iter)
1209 template <typename char_t>
1211 create_mb_node_seq(char_t c)
1213 node* newnode = new char_node(get_byte(c, 0));
1214 for (unsigned int i = 1; i < sizeof(c); ++i)
1216 node* cnode = new char_node(get_byte(c, i));
1217 node* top_node = new cat_node(newnode, cnode);
1223 template <typename char_t>
1225 handle_mb_char(char_t c, bool first_time,
1226 std::stack<node*>& stack)
1228 node* newnode = create_mb_node_seq(c);
1232 stack.push(newnode);
1236 node* top = stack.top();
1239 node* newtop = new or_node(top, newnode);
1244 // forward decl only
1245 template <typename char_t>
1247 handle_mb_range(char_t c1, char_t c2, bool first_time,
1248 std::stack<node*>& stack);
1250 template <typename char_t>
1252 create_nodes(const utility::impl::range_run<char_t>& rr,
1253 std::stack<node*>& stack)
1256 if (sizeof(char_t) == 1)
1258 std::vector<uchar> ccl;
1260 for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin();
1261 iter != rr.end(); ++iter)
1263 for (int i = iter->first; i <= iter->last; ++i)
1265 // this is always true because of the limited datatype
1266 // BOOST_ASSERT(uchar(i) < 256 && ccl.size() == 256);
1271 node* new_ccl = new ccl_node(ccl);
1272 stack.push(new_ccl);
1276 bool mb_first_time = true;
1277 for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin();
1278 iter != rr.end(); ++iter)
1280 if (iter->first == iter->last)
1282 handle_mb_char(iter->first, mb_first_time, stack);
1286 handle_mb_range(iter->first, iter->last, mb_first_time, stack);
1288 mb_first_time = false;
1293 template <typename char_t>
1295 compute_differing_byte(char_t c1, char_t c2)
1297 std::size_t rval = 0;
1298 while (rval < sizeof(c1) &&
1299 get_byte(c1, (unsigned int)rval) == get_byte(c2, (unsigned int)rval))
1306 template <typename char_t>
1308 create_mb_node_type1(std::size_t j, char_t c1, char_t c2)
1310 std::size_t diff = get_byte(c2, (unsigned int)j) -
1311 get_byte(c1, (unsigned int)j);
1315 else if (diff == 2) {
1316 return new char_node(get_byte(c1, (unsigned int)j)+1);
1319 return new ccl_node(get_byte(c1, (unsigned int)j)+1,
1320 get_byte(c2, (unsigned int)j)-1);
1324 template <typename char_t>
1326 create_mb_node_for_byte(std::size_t i, std::size_t j, std::size_t sizem1,
1327 std::size_t differing_byte, char_t c1, char_t c2, node* newnode)
1330 if (i == sizem1 && j == differing_byte && j != sizem1)
1332 node* tmp = create_mb_node_type1(j, c1, c2);
1341 else if (i == differing_byte && j == sizem1)
1344 cnode = new ccl_node(get_byte(c1, (unsigned int)j), 0xFF);
1347 cnode = new ccl_node(get_byte(c1, (unsigned int)j),
1348 get_byte(c2, (unsigned int)j));
1351 else if (i != differing_byte && i != sizem1 &&
1352 j == (sizem1 - i + differing_byte))
1354 cnode = new ccl_node(get_byte(c1, (unsigned int)j)+1, 0xFF);
1356 else if (i + j - differing_byte > sizem1) {
1357 cnode = new ccl_node(0, 0xFF);
1359 else {//if (is plain)
1360 cnode = new char_node(get_byte(c1, (unsigned int)j));
1363 node* top_node = new cat_node(newnode, cnode);
1367 // On platforms, where wchar_t is a typedef for unsigned short, the
1368 // comparision for a negative value is pointless
1369 template <bool is_signed>
1370 struct correct_char_aux {
1374 struct correct_char_aux<true> {
1376 template <typename char_t>
1377 static char_t correct(char_t c) { if (c < 0) c = 0; return c; }
1381 struct correct_char_aux<false> {
1383 template <typename char_t>
1384 static char_t correct(char_t c) { return c; }
1387 template <typename char_t>
1390 static char_t correct(char_t c)
1392 return correct_char_aux<std::numeric_limits<char_t>::is_signed >::
1397 template <typename char_t>
1399 handle_mb_range(char_t c1, char_t c2, bool first_time,
1400 std::stack<node*>& stack)
1402 // The algorithm can't handle negative value chars, which don't make
1403 // much sense anyway. This comparision is pointless for wchar_t's on
1404 // platforms, where wchar_t is a typedef for unsigned short
1406 c1 = correct_char<char_t>::correct(c1);
1410 BOOST_ASSERT(c1 < c2);
1412 node* savednode = 0;
1413 const std::size_t differing_byte = compute_differing_byte(c1, c2);
1414 const std::size_t sizem1 = sizeof(c1) - 1;
1415 const std::size_t ndb = sizem1 - differing_byte;
1416 for (std::size_t i = differing_byte; i < sizeof(c1); ++i)
1418 // generate node for the first byte
1419 if (differing_byte == 0 && i == ndb)
1421 node* tmp = create_mb_node_type1(0, c1, c2);
1429 newnode = new char_node(get_byte(c1, 0));
1431 for (std::size_t j = 1; j < sizeof(c1); ++j)
1433 newnode = create_mb_node_for_byte(i, j, sizem1, differing_byte,
1439 // or together the various parts
1442 node* top_node = new or_node(savednode, newnode);
1443 savednode = top_node;
1447 savednode = newnode;
1453 for (std::size_t k = 0; k < ndb; ++k)
1455 newnode = new char_node(get_byte(c2, 0));
1456 for (std::size_t j = 1; j < sizeof(c2); ++j)
1459 if (k == differing_byte && j == sizem1 && k != sizem1)
1460 cnode = new ccl_node(0, get_byte(c2, (unsigned int)j));
1462 else if (k != differing_byte && k != sizem1 &&
1463 j == (sizem1 - k + differing_byte))
1464 cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)-1);
1466 else if (k + j - differing_byte > sizem1)
1467 cnode = new ccl_node(0, 0xFF);
1469 else //if (is plain)
1470 cnode = new char_node(get_byte(c2, (unsigned int)j));
1473 node* top_node = new cat_node(newnode, cnode);
1477 // or together the various parts
1480 node* top_node = new or_node(savednode, newnode);
1481 savednode = top_node;
1485 savednode = newnode;
1492 stack.push(savednode);
1496 node* top = stack.top();
1499 node* newtop = new or_node(top, savednode);
1503 } // namespace ccl_utils
1505 template <typename ScannerT>
1508 typedef typename ScannerT::iterator_t iterator_type;
1512 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1515 make_char(std::stack<node*>& the_stack)
1516 : m_stack(the_stack)
1519 void operator()(iterator_type const& first, iterator_type const& last) const
1521 const escape_char_parser<lex_escapes, char_t> lex_escape_ch =
1522 escape_char_parser<lex_escapes, char_t>();
1524 iterator_type first_ = first;
1525 ScannerT scan(first_, last);
1526 lex_escape_ch[assign(the_char)].parse(scan);
1527 node* newnode = ccl_utils::create_mb_node_seq(the_char);
1528 m_stack.push(newnode);
1531 std::stack<node*>& m_stack;
1535 template <typename ScannerT>
1538 typedef typename ScannerT::iterator_t iterator_type;
1542 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1545 make_ccl(std::stack<node*>& the_stack)
1546 : m_stack(the_stack)
1549 static bool is_equal_to_string(iterator_type first,
1550 iterator_type const & last, const char* str)
1552 while (first != last &&*str &&*first ==*str)
1560 template <typename ParserT>
1561 static void fill_ccl(utility::impl::range_run<char_t>& rr, const ParserT& parser)
1563 for (int i = 0; i < 256; ++i)
1565 if (parser.test(static_cast<char_t>(uchar(i))))
1566 rr.set(utility::impl::range<char_t>(char_t(i), char_t(i)));
1570 void operator()(iterator_type const& first_, iterator_type const& last) const
1572 BOOST_ASSERT(*first_ == '[');
1574 iterator_type first = first_;
1575 ++first; // skip over '['
1576 bool negated_ccl = false;
1583 utility::impl::range_run<char_t> rr;
1584 while (first != last &&*first != ']')
1586 if (*first == '[') // it's a ccl_expr like [:space:]
1588 // check for [:space:], etc.
1589 if (is_equal_to_string(first, last, "[:alnum:]"))
1591 fill_ccl(rr, alnum_p);
1593 else if (is_equal_to_string(first, last, "[:alpha:]"))
1595 fill_ccl(rr, alpha_p);
1597 else if (is_equal_to_string(first, last, "[:blank:]"))
1599 fill_ccl(rr, blank_p);
1601 else if (is_equal_to_string(first, last, "[:cntrl:]"))
1603 fill_ccl(rr, cntrl_p);
1605 else if (is_equal_to_string(first, last, "[:digit:]"))
1607 fill_ccl(rr, digit_p);
1609 else if (is_equal_to_string(first, last, "[:graph:]"))
1611 fill_ccl(rr, graph_p);
1613 else if (is_equal_to_string(first, last, "[:lower:]"))
1615 fill_ccl(rr, lower_p);
1617 else if (is_equal_to_string(first, last, "[:print:]"))
1619 fill_ccl(rr, print_p);
1621 else if (is_equal_to_string(first, last, "[:punct:]"))
1623 fill_ccl(rr, punct_p);
1625 else if (is_equal_to_string(first, last, "[:space:]"))
1627 fill_ccl(rr, space_p);
1629 else if (is_equal_to_string(first, last, "[:upper:]"))
1631 fill_ccl(rr, upper_p);
1633 else if (is_equal_to_string(first, last, "[:xdigit:]"))
1635 fill_ccl(rr, xdigit_p);
1637 // this can't happen, because it's parsed before we get here.
1639 // throw bad_regex();
1641 // Advance past the character class expression
1642 while (first != last &&*first != ']')
1644 BOOST_ASSERT(*first == ']');
1648 const escape_char_parser<lex_escapes, char_t> lex_escape_ch =
1649 escape_char_parser<lex_escapes, char_t>();
1652 ScannerT scan(first, last);
1653 lex_escape_ch[assign(c1)].parse(scan);
1654 if (*scan.first == '-') // insert a range
1658 lex_escape_ch[assign(c2)].parse(scan);
1659 BOOST_ASSERT(c1 < c2); // Throw exception?
1660 rr.set(utility::impl::range<char_t>(c1, c2));
1662 else // insert 1 char
1664 rr.set(utility::impl::range<char_t>(c1, c1));
1671 rr = ccl_utils::negate_range_run(rr);
1674 ccl_utils::create_nodes(rr, m_stack);
1677 std::stack<node*>& m_stack;
1680 template <typename ScannerT>
1683 typedef typename ScannerT::iterator_t iterator_type;
1687 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1690 std::stack<node*>& m_stack;
1692 make_any_char(std::stack<node*>& the_stack)
1693 : m_stack(the_stack)
1696 void operator()(const char_t c) const
1698 BOOST_ASSERT(c == '.');
1702 void do_any_char() const
1704 static utility::impl::range_run<char_t> rr;
1705 rr.set(full_range<char_t>());
1706 char_t newline = '\n';
1707 rr.clear(utility::impl::range<char_t>(newline, newline));
1709 ccl_utils::create_nodes(rr, m_stack);
1713 template <typename ScannerT>
1716 typedef typename ScannerT::iterator_t iterator_type;
1720 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1723 std::stack<node*>& m_stack;
1725 make_string(std::stack<node*>& the_stack)
1726 : m_stack(the_stack)
1729 void operator()(iterator_type const& first, iterator_type const& last) const
1731 BOOST_ASSERT(*first == '"');
1733 iterator_type first_ = first;
1734 ScannerT scan(first_, last);
1735 ++scan.first; // skip over '"'
1737 // empty string not allowed
1738 if (*scan.first == '"')
1740 boost::throw_exception(bad_regex());
1743 const escape_char_parser<lex_escapes, char_t> lex_escape_ch =
1744 escape_char_parser<lex_escapes, char_t>();
1747 lex_escape_ch[assign(c)].parse(scan);
1748 node* top_node = ccl_utils::create_mb_node_seq(c);
1750 while (*scan.first != '"' && scan.first != scan.last)
1752 lex_escape_ch[assign(c)].parse(scan);
1753 node* cur_node = ccl_utils::create_mb_node_seq(c);
1754 top_node = new cat_node(top_node, cur_node);
1756 m_stack.push(top_node);
1761 node* repeat_node(node* n, int num)
1763 node* list_of_nodes = n;
1764 for (int i = 1; i < num; ++i)
1766 list_of_nodes = new cat_node(list_of_nodes, n->clone());
1768 return list_of_nodes;
1772 node* optional_node(node* n)
1774 return new or_node(n, new epsilon_node());
1777 template <typename ScannerT>
1780 typedef typename ScannerT::iterator_t iterator_type;
1784 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1787 std::stack<node*>& m_stack;
1789 make_rep1(std::stack<node*>& the_stack)
1790 : m_stack(the_stack)
1793 void operator()(iterator_type const& first, iterator_type const& last) const
1795 BOOST_ASSERT(*first == '{');
1797 iterator_type first_ = first;
1798 ScannerT scan(first_, last);
1799 ++scan.first; // skip over '{'
1802 uint_p[assign(count)].parse(scan);
1804 boost::throw_exception(bad_regex());
1806 node* top_node = m_stack.top();
1808 top_node = repeat_node(top_node, count);
1809 m_stack.push(top_node);
1813 template <typename ScannerT>
1816 typedef typename ScannerT::iterator_t iterator_type;
1820 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1823 std::stack<node*>& m_stack;
1825 make_rep2(std::stack<node*>& the_stack)
1826 : m_stack(the_stack)
1829 void operator()(iterator_type const& first, iterator_type const& last) const
1831 BOOST_ASSERT(*first == '{');
1833 iterator_type first_ = first;
1834 ScannerT scan (first_, last);
1835 ++scan.first; // skip over '{'
1838 uint_p[assign(count)].parse(scan);
1840 boost::throw_exception(bad_regex());
1842 node* top_node = m_stack.top();
1844 top_node = new cat_node(repeat_node(top_node, count),
1845 new star_node(top_node->clone()));
1846 m_stack.push(top_node);
1851 template <typename ScannerT>
1854 typedef typename ScannerT::iterator_t iterator_type;
1858 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1861 std::stack<node*>& m_stack;
1863 make_rep3(std::stack<node*>& the_stack)
1864 : m_stack(the_stack)
1867 void operator()(iterator_type const& first, iterator_type const& last) const
1869 BOOST_ASSERT(*first == '{');
1871 iterator_type first_ = first;
1872 ScannerT scan(first_, last);
1873 ++scan.first; // skip over '{'
1875 unsigned int count1, count2;
1876 uint_p[assign(count1)].parse(scan);
1878 boost::throw_exception(bad_regex());
1880 ++scan.first; // skip over ','
1882 uint_p[assign(count2)].parse(scan);
1883 if (count2 <= count1)
1884 boost::throw_exception(bad_regex());
1886 node* top_node = m_stack.top();
1888 node* repeats = repeat_node(top_node, count1);
1889 top_node = new cat_node(repeats,
1890 repeat_node(optional_node(top_node->clone()),
1893 m_stack.push(top_node);
1897 ///////////////////////////////////////////////////////////////////////////////
1901 // Defines the grammar, which mandates the syntax of the understood
1902 // lexeme definitions passed to lexer::register_regex.
1904 ///////////////////////////////////////////////////////////////////////////////
1905 class lexer_grammar : public boost::spirit::classic::grammar<lexer_grammar>
1908 lexer_grammar(std::stack<node*> &node_stack_)
1909 : node_stack(node_stack_) {}
1911 template <typename ScannerT>
1914 typedef rule<ScannerT> rule_t;
1915 typedef typename ScannerT::iterator_t iterator_type;
1917 typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
1920 rule_t regex, re, series, singleton, singleton2, fullccl, ccl, string,
1924 definition(lexer_grammar const &self)
1927 re >> !('/' >> re) >> !ch_p('$')
1932 >>*( ('|' >> series)[make_or<ScannerT>(self.node_stack)] )
1937 >>*( singleton[make_concat<ScannerT>(self.node_stack)] )
1941 ch_p('.')[make_any_char<ScannerT>(self.node_stack)]
1945 | ('"' >> string >> '"')
1947 make_string<ScannerT>(self.node_stack)
1952 | ((anychar_p - chset<>("/|*+?.(){}\\")) | escseq)
1954 make_char<ScannerT>(self.node_stack)
1960 ch_p('*')[make_star<ScannerT>(self.node_stack)]
1962 | ch_p('+')[make_plus<ScannerT>(self.node_stack)]
1964 | ch_p('?')[make_optional<ScannerT>(self.node_stack)]
1966 | ('{' >> uint_p >> '}')
1968 make_rep1<ScannerT>(self.node_stack)
1971 | ('{' >> uint_p >> ',' >> '}')
1973 make_rep2<ScannerT>(self.node_stack)
1976 | ('{' >> uint_p >> ',' >> uint_p >> '}')
1978 make_rep3<ScannerT>(self.node_stack)
1985 ('[' >> !ch_p('^') >> ccl >> ']')
1987 make_ccl<ScannerT>(self.node_stack)
1992 *(ccl_expr | (ccl_char >> !('-' >> ccl_char)))
1996 ( (anychar_p - chset<>("\\\n]")) | escseq )
2015 +( (anychar_p - chset<>("\"\\")) | escseq )
2019 uint_parser<char_t, 8, 1,
2020 std::numeric_limits<char_t>::digits / 3 + 1
2023 uint_parser<char_t, 16, 1,
2024 std::numeric_limits<char_t>::digits / 4 + 1
2031 | as_lower_d['x'] >> hex_parser_t()
2032 | (anychar_p - chset<>('\n'))
2036 #define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
2038 BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG);
2039 BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG);
2040 BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG);
2041 BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG);
2042 BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG);
2043 BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG);
2044 BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG);
2045 BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG);
2046 BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG);
2047 BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG);
2049 #undef BOOST_SLEX_DEBUG
2052 rule<ScannerT> const&
2053 start() const { return regex; }
2056 std::stack<node*> &node_stack;
2058 }; // class lexer_grammar
2060 template <typename StringT>
2062 parse(lexer_grammar& g, StringT const& str)
2065 scanner<typename StringT::const_iterator, scanner_policies<> >
2067 typedef typename rule<scanner_t>::template result<scanner_t>::type
2070 typename StringT::const_iterator first = str.begin();
2071 typename StringT::const_iterator last = str.end();
2073 scanner_t scan(first, last);
2074 // typename rule<scanner_t>::result_t hit = g.parse(scan);
2075 result_t hit = g.parse(scan);
2076 if (!hit || !scan.at_end())
2078 while (g.node_stack.size())
2080 delete g.node_stack.top();
2086 BOOST_ASSERT(g.node_stack.size() == 1);
2087 node* rval = g.node_stack.top();
2089 node* an_eof_node = new eof_node();
2090 rval = new cat_node(rval, an_eof_node);
2095 void make_case_insensitive(state_match_t& state_match)
2098 // This doesn't take into account foreign languages, figure out how to
2099 // do that. Also this approach is broken for this implementation of
2101 for (state_match_t::iterator iter = state_match.begin();
2102 iter != state_match.end(); ++iter)
2105 for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j)
2107 // if either is set, turn them both on
2108 (*iter)[i] = (*iter)[j] = uchar((*iter)[i] | (*iter)[j]);
2113 template<bool wide_char>
2114 struct regex_match_helper;
2117 struct regex_match_helper<false> // single byte char
2119 template <typename DfaT, typename IteratorT>
2121 do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
2124 typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
2127 typedef std::basic_string<
2128 typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
2130 typedef typename string_type::size_type size_type;
2133 node_id_t last_accepting_index = invalid_node;
2134 IteratorT p = first;
2135 IteratorT last_accepting_cpos = first;
2138 s = dfa.transition_table[s][(uchar)*p];
2139 if (s == invalid_node)
2141 if (token) token->append((size_type)1, *p);
2143 if (dfa.acceptance_index[s] != invalid_node)
2145 last_accepting_index = s;
2146 last_accepting_cpos = p;
2149 if (last_accepting_index != invalid_node)
2151 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
2152 std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<
2153 dfa.acceptance_index[last_accepting_index] << '\n';
2156 first = last_accepting_cpos;
2157 regex_index = dfa.acceptance_index[last_accepting_index];
2166 struct regex_match_helper<true> // wide char
2168 template <typename DfaT, typename IteratorT>
2170 do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
2173 typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
2177 typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
2179 typedef std::basic_string<char_t> string_type;
2180 typedef typename string_type::size_type size_type;
2183 node_id_t last_accepting_index = invalid_node;
2184 IteratorT wp = first;
2185 IteratorT last_accepting_cpos = first;
2189 for (unsigned int i = 0; i < sizeof(char_t); ++i)
2191 s = dfa.transition_table[s][get_byte(*wp,i)];
2192 if (s == invalid_node)
2197 if (token) token->append((size_type)1, *wp);
2199 if (dfa.acceptance_index[s] != invalid_node)
2201 last_accepting_index = s;
2202 last_accepting_cpos = wp;
2208 if (last_accepting_index != invalid_node)
2210 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
2211 std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<
2212 dfa.acceptance_index[last_accepting_index] << '\n';
2214 first = last_accepting_cpos;
2215 regex_index = dfa.acceptance_index[last_accepting_index];
2224 template <typename DfaT, typename IteratorT, bool wide_char>
2228 do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
2231 typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
2234 return regex_match_helper<wide_char>::do_match(
2235 dfa, first, last, regex_index, token);
2239 } // namespace lexerimpl
2241 ///////////////////////////////////////////////////////////////////////////////
2243 template <typename IteratorT = char const*, typename TokenT = int,
2244 typename CallbackT = void(*)(IteratorT const &,
2248 lexer_control<TokenT> &)>
2252 typedef CallbackT callback_t;
2254 typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
2259 std::basic_string<char_t> str;
2263 regex_info(const std::basic_string<char_t>& _str,
2264 const TokenT& _token,
2265 const CallbackT& _callback)
2268 , callback(_callback)
2275 std::vector<std::vector<node_id_t> > transition_table;
2276 std::vector<node_id_t> acceptance_index;
2278 typedef std::vector<node_id_t> node_table_t;
2279 typedef std::vector<node_table_t> transition_table_t;
2280 typedef std::vector<dfa_table> dfa_t;
2283 lexer(unsigned int states = 1);
2285 void register_regex(const std::basic_string<char_t>& regex,
2286 const TokenT& id, const CallbackT& cb = CallbackT(),
2287 unsigned int state = 0);
2289 TokenT next_token(IteratorT &first, IteratorT const &last,
2290 std::basic_string<char_t> *token = 0);
2293 bool has_compiled_dfa() { return m_compiled_dfa; }
2295 void set_case_insensitive(bool insensitive);
2297 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
2298 void dump(std::ostream& out);
2300 typedef std::vector<std::vector<regex_info> > regex_list_t;
2302 bool load (std::ifstream &in, long unique_id = 0);
2303 bool save (std::ofstream &out, long unique_id = 0);
2305 SLEX_SIGNATURE = 0x58454C53, // "SLEX"
2306 SLEX_VERSION_100 = 0x0100, // persistance version
2307 SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100,
2308 SLEX_MINOR_VERSION_MASK = 0xFF
2313 void create_dfa_for_state(int state);
2315 static bool regex_match(const dfa_t& dfa, IteratorT& first,
2316 IteratorT& last, int& regex_index);
2318 mutable std::stack<lexerimpl::node*> node_stack;
2319 lexerimpl::lexer_grammar g;
2321 mutable bool m_compiled_dfa;
2322 mutable dfa_t m_dfa;
2324 regex_list_t m_regex_list;
2325 bool m_case_insensitive;
2327 unsigned int m_state;
2328 std::stack<unsigned int> m_state_stack;
2329 unsigned int m_num_states;
2333 template <typename IteratorT, typename TokenT, typename CallbackT>
2335 lexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states)
2337 , m_compiled_dfa(false)
2338 , m_regex_list(states)
2339 , m_case_insensitive(false)
2342 , m_num_states(states)
2344 BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer",
2345 BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX);
2348 template <typename IteratorT, typename TokenT, typename CallbackT>
2350 lexer<IteratorT, TokenT, CallbackT>::register_regex(
2351 const std::basic_string<char_t>& regex, const TokenT& id,
2352 const CallbackT& callback, unsigned int state)
2354 if (state > m_num_states) {
2355 m_regex_list.resize(state);
2356 m_num_states = state;
2358 m_regex_list[state].push_back(regex_info(regex, id, callback));
2361 template <typename IteratorT, typename TokenT, typename CallbackT>
2363 lexer<IteratorT, TokenT, CallbackT>::next_token(
2364 IteratorT &first, IteratorT const& last,
2366 typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
2369 if (!m_compiled_dfa)
2374 IteratorT saved = first;
2376 if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>::
2377 do_match(m_dfa[m_state], first, last, regex_index, token))
2378 return -1; // TODO: can't return -1, need to return some invalid token.
2379 // how to figure this out? We can use traits I guess.
2382 regex_info regex = m_regex_list[m_state][regex_index];
2383 TokenT rval = regex.token;
2386 // execute corresponding callback
2387 lexer_control<TokenT> controller(rval, m_state, m_state_stack);
2388 regex.callback(saved, first, last, regex.token, controller);
2389 if (controller.ignore_current_token_set()) {
2392 return next_token(first, last, token);
2403 bool find_acceptance_state(const node_set& eof_node_ids,
2404 const node_set& current_set,
2405 node_id_t& acceptance_node_id)
2407 for(node_set::const_iterator nsi = eof_node_ids.begin();
2408 nsi != eof_node_ids.end(); ++nsi)
2410 node_id_t eof_node_id =*nsi;
2411 if (current_set.end() != current_set.find(eof_node_id))
2413 // store the first one we come to as the
2415 acceptance_node_id = eof_node_id;
2416 // don't bother searching for more
2423 template <typename RegexListT, typename GrammarT>
2424 #ifndef BOOST_NO_CXX11_SMART_PTR
2425 inline std::unique_ptr<node>
2427 inline std::auto_ptr<node>
2429 parse_regexes(const RegexListT& regex_list, GrammarT& g)
2431 // parse the expressions into a tree
2432 if (regex_list.begin() == regex_list.end())
2433 boost::throw_exception(bad_regex());
2435 typename RegexListT::const_iterator ri = regex_list.begin();
2436 #ifndef BOOST_NO_CXX11_SMART_PTR
2437 std::unique_ptr<node> tree(lexerimpl::parse(g, (*ri).str));
2439 std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str));
2441 if (tree.get() == 0)
2442 boost::throw_exception(bad_regex());
2445 for (/**/; ri != regex_list.end(); ++ri)
2447 #ifndef BOOST_NO_CXX11_SMART_PTR
2448 std::unique_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str));
2450 std::auto_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str));
2452 if (next_tree.get() == 0)
2453 boost::throw_exception(bad_regex());
2454 #ifndef BOOST_NO_CXX11_SMART_PTR
2455 tree = std::unique_ptr<node>(new or_node(tree.release(), next_tree.release()));
2457 tree = std::auto_ptr<node>(new or_node(tree.release(), next_tree.release()));
2463 } //namespace lexerimpl
2465 template <typename IteratorT, typename TokenT, typename CallbackT>
2467 lexer<IteratorT, TokenT, CallbackT>::create_dfa()
2469 m_dfa.resize(m_num_states);
2470 for (unsigned int i = 0; i < m_num_states; ++i)
2471 create_dfa_for_state(i);
2474 // Algorithm from Compilers: Principles, Techniques, and Tools p. 141
2475 template <typename IteratorT, typename TokenT, typename CallbackT>
2477 lexer<IteratorT, TokenT, CallbackT>::create_dfa_for_state(int state)
2479 using lexerimpl::node;
2480 #ifndef BOOST_NO_CXX11_SMART_PTR
2481 std::unique_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g);
2483 std::auto_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g);
2485 node_id_t dummy = 0;
2486 tree->assign_node_ids(dummy);
2488 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
2489 tree->dump(std::cout);
2492 // compute followpos(root)
2493 followpos_t followpos;
2494 tree->compute_followpos(followpos);
2496 // the dfa states <-> nfa state groups
2497 std::map<node_set, node_id_t> dstates1;
2498 std::map<node_id_t, node_set> dstates2;
2500 // the dfa transitions
2501 m_dfa[state].transition_table.push_back(
2502 std::vector<node_id_t>(256, invalid_node));
2503 m_dfa[state].acceptance_index.push_back(invalid_node);
2505 // whether the dfa state has been processed yet
2506 std::vector<node_id_t> marked;
2508 // used to give a unique id to each dfa state
2509 node_id_t num_states = 0;
2511 // initially, the only unmarked state in Dstates is firstpos(root).
2512 marked.push_back(0);
2513 node_set fpr = tree->firstpos();
2516 state_match_t state_match;
2517 tree->compute_state_match(state_match);
2519 if (m_case_insensitive)
2520 lexerimpl::make_case_insensitive(state_match);
2522 node_set eof_node_ids;
2523 tree->get_eof_ids(eof_node_ids);
2524 // translate the eof_node_ids into a 0-based index
2525 std::map<node_id_t, node_id_t> eof_node_id_map;
2527 for (node_set::iterator node_id_it = eof_node_ids.begin();
2528 node_id_it != eof_node_ids.end();
2531 eof_node_id_map[*node_id_it] = x++;
2534 // figure out if this is an acceptance state
2535 node_id_t eof_node_id;
2536 if (lexerimpl::find_acceptance_state(eof_node_ids, fpr, eof_node_id))
2538 m_dfa[state].acceptance_index[0] = eof_node_id_map[eof_node_id];
2541 std::vector<node_id_t>::iterator i = std::find(marked.begin(), marked.end(),
2543 while (marked.end() != i)
2546 node_id_t T = node_id_t(std::distance(marked.begin(), i));
2547 BOOST_ASSERT(T < dstates2.size());
2548 node_set Tstates = dstates2[T];
2549 for (node_id_t j = 0; j < 256; ++j)
2552 for (node_set::iterator k = Tstates.begin();
2553 k != Tstates.end(); ++k)
2556 BOOST_ASSERT(p < state_match.size());
2557 BOOST_ASSERT(j < state_match[p].size());
2558 if (state_match[p][j])
2560 node_set fpp = followpos[p];
2561 U.insert(fpp.begin(), fpp.end());
2566 std::map<node_set, node_id_t>::iterator l = dstates1.find(U);
2567 node_id_t target_state;
2568 if (l == dstates1.end()) // not in the states yet
2571 dstates1[U] = target_state = num_states;
2572 dstates2[target_state] = U;
2573 marked.push_back(0);
2574 m_dfa[state].transition_table.push_back(
2575 std::vector<node_id_t>(256, invalid_node));
2576 m_dfa[state].acceptance_index.push_back(invalid_node);
2577 // figure out if this is an acceptance state
2578 node_id_t eof_node_id;
2579 if (lexerimpl::find_acceptance_state(eof_node_ids, U, eof_node_id))
2581 m_dfa[state].acceptance_index[target_state] =
2582 eof_node_id_map[eof_node_id];
2587 target_state = dstates1[U];
2590 BOOST_ASSERT(T < m_dfa[state].transition_table.size());
2591 BOOST_ASSERT(j < m_dfa[state].transition_table[T].size());
2592 m_dfa[state].transition_table[T][j] = target_state;
2597 i = std::find(marked.begin(), marked.end(), node_id_t(0));
2599 m_compiled_dfa = true;
2602 template <typename IteratorT, typename TokenT, typename CallbackT>
2604 lexer<IteratorT, TokenT, CallbackT>::set_case_insensitive(bool insensitive)
2606 m_case_insensitive = insensitive;
2610 #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
2611 template <typename IteratorT, typename TokenT, typename CallbackT>
2613 lexer<IteratorT, TokenT, CallbackT>::dump(std::ostream& out)
2615 for (unsigned x = 0; x < m_dfa.size(); ++x)
2617 out << "\nm_dfa[" << x << "] has " << m_dfa[x].transition_table.size() << " states\n";
2618 for (node_id_t i = 0; i < m_dfa[x].transition_table.size(); ++i)
2620 out << "state " << i << ":";
2621 for (node_id_t j = 0; j < m_dfa[x].transition_table[i].size(); ++j)
2623 if (m_dfa[x].transition_table[i][j] != invalid_node)
2624 out << j << "->" << m_dfa[x].transition_table[i][j] << " ";
2628 out << "acceptance states: ";
2629 for(unsigned int k = 0; k < m_dfa[x].acceptance_index.size(); ++k)
2631 if (m_dfa[x].acceptance_index[k] != invalid_node)
2632 out << '<' << k << ',' << m_dfa[x].acceptance_index[k] << "> ";
2639 ///////////////////////////////////////////////////////////////////////////////
2640 // load the lexer tables
2641 #define slex_in(strm, val) \
2642 strm.read((char*)&val, sizeof(val)); \
2643 if (std::ios::goodbit != strm.rdstate()) return false
2645 template <typename IteratorT, typename TokenT, typename CallbackT>
2647 lexer<IteratorT, TokenT, CallbackT>::load (std::ifstream &in, long unique_id)
2649 // ensure correct signature and version
2652 slex_in (in, signature);
2653 if (signature != SLEX_SIGNATURE)
2654 return false; // not for us
2658 slex_in (in, version);
2659 if ((version & ~SLEX_MINOR_VERSION_MASK) > SLEX_LAST_KNOWN_VERSION)
2660 return false; // to new for us
2665 if (uid != unique_id)
2666 return false; // not saved by us
2668 // load auxiliary members
2671 slex_in (in, num_states);
2673 // load the dfa tables
2675 std::size_t dfa_size = 0;
2677 slex_in (in, dfa_size);
2678 in_dfa.resize(dfa_size);
2679 for (std::size_t dfa = 0; dfa < dfa_size; ++dfa)
2681 // load the transition tables
2682 std::size_t tt_size = 0;
2683 transition_table_t &tt_table = in_dfa[dfa].transition_table;
2685 slex_in (in, tt_size);
2686 tt_table.resize(tt_size);
2687 for (std::size_t tt = 0; tt < tt_size; ++tt)
2689 std::size_t nt_size = 0;
2690 node_table_t &nt_table = tt_table[tt];
2692 slex_in (in, nt_size);
2693 nt_table.resize(nt_size);
2694 for (std::size_t nt = 0; nt < nt_size; ++nt)
2696 slex_in (in, nt_table[nt]);
2700 // load the acceptance index table
2701 std::size_t ai_size = 0;
2702 node_table_t &ai_table = in_dfa[dfa].acceptance_index;
2704 slex_in (in, ai_size);
2705 ai_table.resize(ai_size);
2706 for (std::size_t ai = 0; ai < ai_size; ++ai)
2708 slex_in (in, ai_table[ai]);
2712 m_dfa.swap(in_dfa); // success, swap in the read values
2713 m_num_states = num_states;
2715 m_compiled_dfa = true;
2721 ///////////////////////////////////////////////////////////////////////////////
2722 // save the lexer tables
2723 #define slex_out(strm, val) \
2724 strm.write((char*)&val, sizeof(val)); \
2725 if (std::ios::goodbit != strm.rdstate()) return false
2727 template <typename IteratorT, typename TokenT, typename CallbackT>
2729 lexer<IteratorT, TokenT, CallbackT>::save (std::ofstream &out, long unique_id)
2731 // save signature and version information
2732 long out_long = SLEX_SIGNATURE;
2734 slex_out(out, out_long);
2735 out_long = SLEX_VERSION_100;
2736 slex_out(out, out_long);
2737 slex_out(out, unique_id);
2739 // save auxiliary members
2740 slex_out(out, m_num_states);
2742 // save the dfa tables
2743 typedef typename dfa_t::const_iterator dfa_iter_t;
2744 typedef transition_table_t::const_iterator transition_table_iter_t;
2745 typedef node_table_t::const_iterator node_table_iter_t;
2747 std::size_t out_size_t = m_dfa.size();
2748 slex_out(out, out_size_t);
2750 dfa_iter_t end = m_dfa.end();
2751 for (dfa_iter_t it = m_dfa.begin(); it != end; ++it)
2753 // save the transition table
2754 out_size_t = (*it).transition_table.size();
2755 slex_out(out, out_size_t);
2757 transition_table_iter_t tt_end = (*it).transition_table.end();
2758 for (transition_table_iter_t tt_it = (*it).transition_table.begin();
2762 out_size_t = (*tt_it).size();
2763 slex_out(out, out_size_t);
2765 node_table_iter_t nt_end = (*tt_it).end();
2766 for (node_table_iter_t nt_it = (*tt_it).begin();
2770 slex_out(out, (*nt_it));
2774 // save the acceptance index table
2775 out_size_t = (*it).acceptance_index.size();
2776 slex_out(out, out_size_t);
2778 node_table_iter_t nt_end = (*it).acceptance_index.end();
2779 for (node_table_iter_t nt_it = (*it).acceptance_index.begin();
2783 slex_out(out, (*nt_it));
2792 a lexer_control object supports some operations on the lexer.
2793 get current lexer state
2796 state stack (push, pop, top)
2797 set new token return value
2798 ignore the current token
2800 get length of matched token
2802 template <typename TokenT>
2807 lexer_control(TokenT& token, unsigned int& current_state,
2808 std::stack<unsigned int>& state_stack);
2809 // functions dealing with the lexer state
2811 // set the state to state
2812 void set_state(unsigned int state);
2814 // get the current state
2815 unsigned int get_state();
2817 // pushes the current state onto the top of the state stack and
2818 // switches to new_state
2819 void push_state(unsigned int new_state);
2821 // pops the top of the state stack and switches to it.
2824 // returns the top of the stack without altering the stack's contents.
2825 unsigned int top_state();
2827 // functions dealing with the token returned.
2829 // set a new TokenT return value, overriding that one that was
2830 // registered via. register_regex()
2831 void set_token(const TokenT& token);
2833 // tell the lexer to return an invalid token, signifying termination.
2836 // ignore the current token, and move on to the next one. The current
2837 // token will NOT be returned from next_token()
2838 void ignore_current_token();
2840 // returns true if ignore_current_token() has been called,
2842 bool ignore_current_token_set();
2846 bool m_ignore_current_token;
2847 unsigned int& m_current_state;
2848 std::stack<unsigned int>& m_state_stack;
2851 template <typename TokenT>
2853 lexer_control<TokenT>::lexer_control(TokenT& token, unsigned int& current_state,
2854 std::stack<unsigned int>& state_stack)
2856 , m_ignore_current_token(false)
2857 , m_current_state(current_state)
2858 , m_state_stack(state_stack)
2862 template <typename TokenT>
2864 lexer_control<TokenT>::set_state(unsigned int state)
2866 m_current_state = state;
2869 template <typename TokenT>
2871 lexer_control<TokenT>::get_state()
2873 return m_current_state;
2876 template <typename TokenT>
2878 lexer_control<TokenT>::push_state(unsigned int new_state)
2880 m_state_stack.push(m_current_state);
2881 m_current_state = new_state;
2884 template <typename TokenT>
2886 lexer_control<TokenT>::pop_state()
2888 m_current_state = m_state_stack.top();
2889 m_state_stack.pop();
2892 template <typename TokenT>
2894 lexer_control<TokenT>::top_state()
2896 return m_state_stack.top();
2899 template <typename TokenT>
2901 lexer_control<TokenT>::set_token(const TokenT& token)
2906 template <typename TokenT>
2908 lexer_control<TokenT>::terminate()
2910 m_token = -1; // TOOD: fix this, need to be determined by traits
2913 template <typename TokenT>
2915 lexer_control<TokenT>::ignore_current_token()
2917 m_ignore_current_token = true;
2920 template <typename TokenT>
2922 lexer_control<TokenT>::ignore_current_token_set()
2924 return m_ignore_current_token;
2927 } // namespace classic
2928 } // namespace spirit
2929 } // namespace boost
2931 #undef BOOST_SPIRIT_IT_NS