]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /////////////////////////////////////////////////////////////////////////////// |
2 | /// \file regex_token_iterator.hpp | |
3 | /// Contains the definition of regex_token_iterator, an STL-compatible iterator | |
4 | /// for tokenizing a string using a regular expression. | |
5 | // | |
6 | // Copyright 2008 Eric Niebler. Distributed under the Boost | |
7 | // Software License, Version 1.0. (See accompanying file | |
8 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
9 | ||
10 | #ifndef BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005 | |
11 | #define BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005 | |
12 | ||
13 | // MS compatible compilers support #pragma once | |
14 | #if defined(_MSC_VER) | |
15 | # pragma once | |
16 | #endif | |
17 | ||
18 | #include <vector> | |
19 | #include <boost/assert.hpp> | |
20 | #include <boost/mpl/assert.hpp> | |
21 | #include <boost/type_traits/is_same.hpp> | |
22 | #include <boost/type_traits/is_convertible.hpp> | |
23 | #include <boost/xpressive/regex_iterator.hpp> | |
24 | ||
25 | namespace boost { namespace xpressive { namespace detail | |
26 | { | |
27 | ||
28 | ////////////////////////////////////////////////////////////////////////// | |
29 | // regex_token_iterator_impl | |
30 | // | |
31 | template<typename BidiIter> | |
32 | struct regex_token_iterator_impl | |
33 | : counted_base<regex_token_iterator_impl<BidiIter> > | |
34 | { | |
35 | typedef sub_match<BidiIter> value_type; | |
36 | ||
37 | regex_token_iterator_impl | |
38 | ( | |
39 | BidiIter begin | |
40 | , BidiIter cur | |
41 | , BidiIter end | |
42 | , BidiIter next_search | |
43 | , basic_regex<BidiIter> const &rex | |
44 | , regex_constants::match_flag_type flags = regex_constants::match_default | |
45 | , std::vector<int> subs = std::vector<int>(1, 0) | |
46 | , int n = -2 | |
47 | , bool not_null = false | |
48 | ) | |
49 | : iter_(begin, cur, end, next_search, rex, flags, not_null) | |
50 | , result_() | |
51 | , n_((-2 == n) ? (int)subs.size() - 1 : n) | |
52 | , subs_() | |
53 | { | |
54 | BOOST_ASSERT(0 != subs.size()); | |
55 | this->subs_.swap(subs); | |
56 | } | |
57 | ||
58 | bool next() | |
59 | { | |
60 | if(-1 != this->n_) | |
61 | { | |
62 | BidiIter cur = this->iter_.state_.cur_; | |
63 | if(0 != (++this->n_ %= (int)this->subs_.size()) || this->iter_.next()) | |
64 | { | |
65 | this->result_ = (-1 == this->subs_[ this->n_ ]) | |
66 | ? this->iter_.what_.prefix() | |
67 | : this->iter_.what_[ this->subs_[ this->n_ ] ]; | |
68 | return true; | |
69 | } | |
70 | else if(-1 == this->subs_[ this->n_-- ] && cur != this->iter_.state_.end_) | |
71 | { | |
72 | this->result_ = value_type(cur, this->iter_.state_.end_, true); | |
73 | return true; | |
74 | } | |
75 | } | |
76 | ||
77 | return false; | |
78 | } | |
79 | ||
80 | bool equal_to(regex_token_iterator_impl<BidiIter> const &that) const | |
81 | { | |
82 | return this->iter_.equal_to(that.iter_) && this->n_ == that.n_; | |
83 | } | |
84 | ||
85 | regex_iterator_impl<BidiIter> iter_; | |
86 | value_type result_; | |
87 | int n_; | |
88 | std::vector<int> subs_; | |
89 | }; | |
90 | ||
/// Identity overload: a plain integer is returned unchanged as the mark number.
inline int get_mark_number(int i)
{
    int const mark_number = i;
    return mark_number;
}
95 | ||
/// Wraps a single sub-match index in a one-element vector.
inline std::vector<int> to_vector(int subs)
{
    std::vector<int> single;
    single.push_back(subs);
    return single;
}
100 | ||
/// A vector of int already has the required type: the caller's vector is
/// returned by reference, without copying.
inline std::vector<int> const &to_vector(std::vector<int> const &subs)
{
    std::vector<int> const &same = subs;
    return same;
}
105 | ||
106 | template<typename Int, std::size_t Size> | |
107 | inline std::vector<int> to_vector(Int const (&sub_matches)[ Size ]) | |
108 | { | |
109 | // so that people can specify sub-match indices inline with | |
110 | // string literals, like "\1\2\3", leave off the trailing '\0' | |
111 | std::size_t const size = Size - is_same<Int, char>::value; | |
112 | std::vector<int> vect(size); | |
113 | for(std::size_t i = 0; i < size; ++i) | |
114 | { | |
115 | vect[i] = get_mark_number(sub_matches[i]); | |
116 | } | |
117 | return vect; | |
118 | } | |
119 | ||
120 | template<typename Int> | |
121 | inline std::vector<int> to_vector(std::vector<Int> const &sub_matches) | |
122 | { | |
123 | BOOST_MPL_ASSERT((is_convertible<Int, int>)); | |
124 | return std::vector<int>(sub_matches.begin(), sub_matches.end()); | |
125 | } | |
126 | ||
127 | } // namespace detail | |
128 | ||
129 | ////////////////////////////////////////////////////////////////////////// | |
130 | // regex_token_iterator | |
131 | // | |
132 | template<typename BidiIter> | |
133 | struct regex_token_iterator | |
134 | { | |
135 | typedef basic_regex<BidiIter> regex_type; | |
136 | typedef typename iterator_value<BidiIter>::type char_type; | |
137 | typedef sub_match<BidiIter> value_type; | |
138 | typedef std::ptrdiff_t difference_type; | |
139 | typedef value_type const *pointer; | |
140 | typedef value_type const &reference; | |
141 | typedef std::forward_iterator_tag iterator_category; | |
142 | ||
143 | /// INTERNAL ONLY | |
144 | typedef detail::regex_token_iterator_impl<BidiIter> impl_type_; | |
145 | ||
146 | /// \post \c *this is the end of sequence iterator. | |
147 | regex_token_iterator() | |
148 | : impl_() | |
149 | { | |
150 | } | |
151 | ||
152 | /// \param begin The beginning of the character range to search. | |
153 | /// \param end The end of the character range to search. | |
154 | /// \param rex The regex pattern to search for. | |
155 | /// \pre \c [begin,end) is a valid range. | |
156 | regex_token_iterator | |
157 | ( | |
158 | BidiIter begin | |
159 | , BidiIter end | |
160 | , basic_regex<BidiIter> const &rex | |
161 | ) | |
162 | : impl_() | |
163 | { | |
164 | if(0 != rex.regex_id()) | |
165 | { | |
166 | this->impl_ = new impl_type_(begin, begin, end, begin, rex); | |
167 | this->next_(); | |
168 | } | |
169 | } | |
170 | ||
171 | /// \param begin The beginning of the character range to search. | |
172 | /// \param end The end of the character range to search. | |
173 | /// \param rex The regex pattern to search for. | |
174 | /// \param args A let() expression with argument bindings for semantic actions. | |
175 | /// \pre \c [begin,end) is a valid range. | |
176 | template<typename LetExpr> | |
177 | regex_token_iterator | |
178 | ( | |
179 | BidiIter begin | |
180 | , BidiIter end | |
181 | , basic_regex<BidiIter> const &rex | |
182 | , detail::let_<LetExpr> const &args | |
183 | ) | |
184 | : impl_() | |
185 | { | |
186 | if(0 != rex.regex_id()) | |
187 | { | |
188 | this->impl_ = new impl_type_(begin, begin, end, begin, rex); | |
189 | detail::bind_args(args, this->impl_->iter_.what_); | |
190 | this->next_(); | |
191 | } | |
192 | } | |
193 | ||
194 | /// \param begin The beginning of the character range to search. | |
195 | /// \param end The end of the character range to search. | |
196 | /// \param rex The regex pattern to search for. | |
197 | /// \param subs A range of integers designating sub-matches to be treated as tokens. | |
198 | /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.) | |
199 | /// \pre \c [begin,end) is a valid range. | |
200 | /// \pre \c subs is either an integer greater or equal to -1, | |
201 | /// or else an array or non-empty \c std::vector\<\> of such integers. | |
202 | template<typename Subs> | |
203 | regex_token_iterator | |
204 | ( | |
205 | BidiIter begin | |
206 | , BidiIter end | |
207 | , basic_regex<BidiIter> const &rex | |
208 | , Subs const &subs | |
209 | , regex_constants::match_flag_type flags = regex_constants::match_default | |
210 | ) | |
211 | : impl_() | |
212 | { | |
213 | if(0 != rex.regex_id()) | |
214 | { | |
215 | this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs)); | |
216 | this->next_(); | |
217 | } | |
218 | } | |
219 | ||
220 | /// \param begin The beginning of the character range to search. | |
221 | /// \param end The end of the character range to search. | |
222 | /// \param rex The regex pattern to search for. | |
223 | /// \param subs A range of integers designating sub-matches to be treated as tokens. | |
224 | /// \param args A let() expression with argument bindings for semantic actions. | |
225 | /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.) | |
226 | /// \pre \c [begin,end) is a valid range. | |
227 | /// \pre \c subs is either an integer greater or equal to -1, | |
228 | /// or else an array or non-empty \c std::vector\<\> of such integers. | |
229 | template<typename Subs, typename LetExpr> | |
230 | regex_token_iterator | |
231 | ( | |
232 | BidiIter begin | |
233 | , BidiIter end | |
234 | , basic_regex<BidiIter> const &rex | |
235 | , Subs const &subs | |
236 | , detail::let_<LetExpr> const &args | |
237 | , regex_constants::match_flag_type flags = regex_constants::match_default | |
238 | ) | |
239 | : impl_() | |
240 | { | |
241 | if(0 != rex.regex_id()) | |
242 | { | |
243 | this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs)); | |
244 | detail::bind_args(args, this->impl_->iter_.what_); | |
245 | this->next_(); | |
246 | } | |
247 | } | |
248 | ||
249 | /// \post <tt>*this == that</tt> | |
250 | regex_token_iterator(regex_token_iterator<BidiIter> const &that) | |
251 | : impl_(that.impl_) // COW | |
252 | { | |
253 | } | |
254 | ||
255 | /// \post <tt>*this == that</tt> | |
256 | regex_token_iterator<BidiIter> &operator =(regex_token_iterator<BidiIter> const &that) | |
257 | { | |
258 | this->impl_ = that.impl_; // COW | |
259 | return *this; | |
260 | } | |
261 | ||
262 | friend bool operator ==(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right) | |
263 | { | |
264 | if(!left.impl_ || !right.impl_) | |
265 | { | |
266 | return !left.impl_ && !right.impl_; | |
267 | } | |
268 | ||
269 | return left.impl_->equal_to(*right.impl_); | |
270 | } | |
271 | ||
272 | friend bool operator !=(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right) | |
273 | { | |
274 | return !(left == right); | |
275 | } | |
276 | ||
277 | value_type const &operator *() const | |
278 | { | |
279 | return this->impl_->result_; | |
280 | } | |
281 | ||
282 | value_type const *operator ->() const | |
283 | { | |
284 | return &this->impl_->result_; | |
285 | } | |
286 | ||
287 | /// If N == -1 then sets *this equal to the end of sequence iterator. | |
288 | /// Otherwise if N+1 \< subs.size(), then increments N and sets result equal to | |
289 | /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())). | |
290 | /// Otherwise if what.prefix().first != what[0].second and if the element match_prev_avail is | |
291 | /// not set in flags then sets it. Then locates the next match as if by calling | |
292 | /// regex_search(what[0].second, end, what, *pre, flags), with the following variation: | |
293 | /// in the event that the previous match found was of zero length (what[0].length() == 0) | |
294 | /// then attempts to find a non-zero length match starting at what[0].second, only if that | |
295 | /// fails and provided what[0].second != suffix().second does it look for a (possibly zero | |
296 | /// length) match starting from what[0].second + 1. If such a match is found then sets N | |
297 | /// equal to zero, and sets result equal to | |
298 | /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())). | |
299 | /// Otherwise if no further matches were found, then let last_end be the endpoint of the last | |
300 | /// match that was found. Then if last_end != end and subs[0] == -1 sets N equal to -1 and | |
301 | /// sets result equal to value_type(last_end, end). Otherwise sets *this equal to the end | |
302 | /// of sequence iterator. | |
303 | regex_token_iterator<BidiIter> &operator ++() | |
304 | { | |
305 | this->fork_(); // un-share the implementation | |
306 | this->next_(); | |
307 | return *this; | |
308 | } | |
309 | ||
310 | regex_token_iterator<BidiIter> operator ++(int) | |
311 | { | |
312 | regex_token_iterator<BidiIter> tmp(*this); | |
313 | ++*this; | |
314 | return tmp; | |
315 | } | |
316 | ||
317 | private: | |
318 | ||
319 | /// INTERNAL ONLY | |
320 | void fork_() | |
321 | { | |
322 | if(1 != this->impl_->use_count()) | |
323 | { | |
324 | intrusive_ptr<impl_type_> clone = new impl_type_ | |
325 | ( | |
326 | this->impl_->iter_.state_.begin_ | |
327 | , this->impl_->iter_.state_.cur_ | |
328 | , this->impl_->iter_.state_.end_ | |
329 | , this->impl_->iter_.state_.next_search_ | |
330 | , this->impl_->iter_.rex_ | |
331 | , this->impl_->iter_.flags_ | |
332 | , this->impl_->subs_ | |
333 | , this->impl_->n_ | |
334 | , this->impl_->iter_.not_null_ | |
335 | ); | |
336 | ||
337 | // only copy the match_results struct if we have to. Note: if the next call | |
338 | // to impl_->next() will return false or call regex_search, we don't need to | |
339 | // copy the match_results struct. | |
340 | if(-1 != this->impl_->n_ && this->impl_->n_ + 1 != static_cast<int>(this->impl_->subs_.size())) | |
341 | { | |
342 | // BUGBUG This is expensive -- it causes the sequence_stack to be cleared. | |
343 | // Find a better way | |
344 | clone->iter_.what_ = this->impl_->iter_.what_; | |
345 | } | |
346 | else | |
347 | { | |
348 | // At the very least, copy the action args | |
349 | detail::core_access<BidiIter>::get_action_args(clone->iter_.what_) | |
350 | = detail::core_access<BidiIter>::get_action_args(this->impl_->iter_.what_); | |
351 | } | |
352 | ||
353 | this->impl_.swap(clone); | |
354 | } | |
355 | } | |
356 | ||
357 | /// INTERNAL ONLY | |
358 | void next_() | |
359 | { | |
360 | BOOST_ASSERT(this->impl_ && 1 == this->impl_->use_count()); | |
361 | if(!this->impl_->next()) | |
362 | { | |
363 | this->impl_ = 0; | |
364 | } | |
365 | } | |
366 | ||
367 | intrusive_ptr<impl_type_> impl_; | |
368 | }; | |
369 | ||
370 | }} // namespace boost::xpressive | |
371 | ||
372 | #endif |