]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /////////////////////////////////////////////////////////////////////////////// |
2 | // parse_charset.hpp | |
3 | // | |
4 | // Copyright 2008 Eric Niebler. Distributed under the Boost | |
5 | // Software License, Version 1.0. (See accompanying file | |
6 | // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
7 | ||
8 | #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005 | |
9 | #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005 | |
10 | ||
11 | // MS compatible compilers support #pragma once | |
12 | #if defined(_MSC_VER) | |
13 | # pragma once | |
14 | #endif | |
15 | ||
16 | #include <boost/config.hpp> | |
17 | #include <boost/integer.hpp> | |
18 | #include <boost/mpl/bool.hpp> | |
19 | #include <boost/throw_exception.hpp> | |
20 | #include <boost/numeric/conversion/converter.hpp> | |
21 | #include <boost/xpressive/detail/detail_fwd.hpp> | |
22 | #include <boost/xpressive/detail/dynamic/parser_enum.hpp> | |
23 | #include <boost/xpressive/detail/utility/literals.hpp> | |
24 | #include <boost/xpressive/detail/utility/chset/chset.hpp> | |
25 | #include <boost/xpressive/regex_constants.hpp> | |
26 | ||
27 | namespace boost { namespace xpressive { namespace detail | |
28 | { | |
29 | ||
// Discriminator stored in escape_value::type_; it says which payload member of
// the escape_value carries the parsed result.
//   escape_char  - a single literal character (escape_value::ch_)
//   escape_mark  - presumably a sub-match reference (escape_value::mark_nbr_);
//                  not produced anywhere in this header -- confirm at call sites
//   escape_class - a character class (escape_value::class_)
enum escape_type
{
    escape_char  = 0
  , escape_mark  = 1
  , escape_class = 2
};
36 | ||
37 | /////////////////////////////////////////////////////////////////////////////// | |
38 | // escape_value | |
39 | // | |
40 | template<typename Char, typename Class> | |
41 | struct escape_value | |
42 | { | |
43 | Char ch_; | |
44 | int mark_nbr_; | |
45 | Class class_; | |
46 | escape_type type_; | |
47 | }; | |
48 | ||
49 | /////////////////////////////////////////////////////////////////////////////// | |
50 | // char_overflow_handler | |
51 | // | |
52 | struct char_overflow_handler | |
53 | { | |
54 | void operator ()(numeric::range_check_result result) const // throw(regex_error) | |
55 | { | |
56 | if(numeric::cInRange != result) | |
57 | { | |
58 | BOOST_THROW_EXCEPTION( | |
59 | regex_error( | |
60 | regex_constants::error_escape | |
61 | , "character escape too large to fit in target character type" | |
62 | ) | |
63 | ); | |
64 | } | |
65 | } | |
66 | }; | |
67 | ||
68 | /////////////////////////////////////////////////////////////////////////////// | |
69 | // parse_escape | |
70 | // | |
71 | template<typename FwdIter, typename CompilerTraits> | |
72 | escape_value<typename iterator_value<FwdIter>::type, typename CompilerTraits::regex_traits::char_class_type> | |
73 | parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &tr) | |
74 | { | |
75 | using namespace regex_constants; | |
76 | typedef typename iterator_value<FwdIter>::type char_type; | |
77 | typedef typename CompilerTraits::regex_traits regex_traits; | |
78 | typedef typename regex_traits::char_class_type char_class_type; | |
79 | ||
80 | // define an unsigned type the same size as char_type | |
81 | typedef typename boost::uint_t<CHAR_BIT * sizeof(char_type)>::least uchar_t; | |
82 | BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type)); | |
83 | typedef numeric::conversion_traits<uchar_t, int> converstion_traits; | |
84 | ||
85 | BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found"); | |
86 | numeric::converter<int, uchar_t, converstion_traits, char_overflow_handler> converter; | |
87 | escape_value<char_type,char_class_type> esc = { 0, 0, 0, escape_char }; | |
88 | bool const icase = (0 != (regex_constants::icase_ & tr.flags())); | |
89 | regex_traits const &rxtraits = tr.traits(); | |
90 | FwdIter tmp; | |
91 | ||
92 | esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase); | |
93 | if(0 != esc.class_) | |
94 | { | |
95 | esc.type_ = escape_class; | |
96 | return esc; | |
97 | } | |
98 | ||
99 | if(-1 != rxtraits.value(*begin, 8)) | |
100 | { | |
101 | esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777)); | |
102 | return esc; | |
103 | } | |
104 | ||
105 | switch(*begin) | |
106 | { | |
107 | // bell character | |
108 | case BOOST_XPR_CHAR_(char_type, 'a'): | |
109 | esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a'); | |
110 | ++begin; | |
111 | break; | |
112 | // escape character | |
113 | case BOOST_XPR_CHAR_(char_type, 'e'): | |
114 | esc.ch_ = converter(27); | |
115 | ++begin; | |
116 | break; | |
117 | // control character | |
118 | case BOOST_XPR_CHAR_(char_type, 'c'): | |
119 | BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); | |
120 | BOOST_XPR_ENSURE_ | |
121 | ( | |
122 | rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin) | |
123 | || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin) | |
124 | , error_escape | |
125 | , "invalid escape control letter; must be one of a-z or A-Z" | |
126 | ); | |
127 | // Convert to character according to ECMA-262, section 15.10.2.10: | |
128 | esc.ch_ = converter(*begin % 32); | |
129 | ++begin; | |
130 | break; | |
131 | // formfeed character | |
132 | case BOOST_XPR_CHAR_(char_type, 'f'): | |
133 | esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f'); | |
134 | ++begin; | |
135 | break; | |
136 | // newline | |
137 | case BOOST_XPR_CHAR_(char_type, 'n'): | |
138 | esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n'); | |
139 | ++begin; | |
140 | break; | |
141 | // return | |
142 | case BOOST_XPR_CHAR_(char_type, 'r'): | |
143 | esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r'); | |
144 | ++begin; | |
145 | break; | |
146 | // horizontal tab | |
147 | case BOOST_XPR_CHAR_(char_type, 't'): | |
148 | esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t'); | |
149 | ++begin; | |
150 | break; | |
151 | // vertical tab | |
152 | case BOOST_XPR_CHAR_(char_type, 'v'): | |
153 | esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v'); | |
154 | ++begin; | |
155 | break; | |
156 | // hex escape sequence | |
157 | case BOOST_XPR_CHAR_(char_type, 'x'): | |
158 | BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); | |
159 | tmp = begin; | |
160 | esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff)); | |
161 | BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : " | |
162 | "must be \\x HexDigit HexDigit"); | |
163 | break; | |
164 | // Unicode escape sequence | |
165 | case BOOST_XPR_CHAR_(char_type, 'u'): | |
166 | BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); | |
167 | tmp = begin; | |
168 | esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff)); | |
169 | BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : " | |
170 | "must be \\u HexDigit HexDigit HexDigit HexDigit"); | |
171 | break; | |
172 | // backslash | |
173 | case BOOST_XPR_CHAR_(char_type, '\\'): | |
174 | //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\'); | |
175 | //++begin; | |
176 | //break; | |
177 | // all other escaped characters represent themselves | |
178 | default: | |
179 | esc.ch_ = *begin; | |
180 | ++begin; | |
181 | break; | |
182 | } | |
183 | ||
184 | return esc; | |
185 | } | |
186 | ||
187 | ////////////////////////////////////////////////////////////////////////// | |
188 | // parse_charset | |
189 | // | |
190 | template<typename FwdIter, typename RegexTraits, typename CompilerTraits> | |
191 | inline void parse_charset | |
192 | ( | |
193 | FwdIter &begin | |
194 | , FwdIter end | |
195 | , compound_charset<RegexTraits> &chset | |
196 | , CompilerTraits &tr | |
197 | ) | |
198 | { | |
199 | using namespace regex_constants; | |
200 | typedef typename RegexTraits::char_type char_type; | |
201 | typedef typename RegexTraits::char_class_type char_class_type; | |
202 | BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); | |
203 | RegexTraits const &rxtraits = tr.traits(); | |
204 | bool const icase = (0 != (regex_constants::icase_ & tr.flags())); | |
205 | FwdIter iprev = FwdIter(); | |
206 | escape_value<char_type, char_class_type> esc = {0, 0, 0, escape_char}; | |
207 | bool invert = false; | |
208 | ||
209 | // check to see if we have an inverse charset | |
210 | if(begin != end && token_charset_invert == tr.get_charset_token(iprev = begin, end)) | |
211 | { | |
212 | begin = iprev; | |
213 | invert = true; | |
214 | } | |
215 | ||
216 | // skip the end token if-and-only-if it is the first token in the charset | |
217 | if(begin != end && token_charset_end == tr.get_charset_token(iprev = begin, end)) | |
218 | { | |
219 | for(; begin != iprev; ++begin) | |
220 | { | |
221 | chset.set_char(*begin, rxtraits, icase); | |
222 | } | |
223 | } | |
224 | ||
225 | compiler_token_type tok; | |
226 | char_type ch_prev = char_type(), ch_next = char_type(); | |
227 | bool have_prev = false; | |
228 | ||
229 | BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); | |
230 | ||
231 | // remember the current position and grab the next token | |
232 | iprev = begin; | |
233 | tok = tr.get_charset_token(begin, end); | |
234 | do | |
235 | { | |
236 | BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); | |
237 | ||
238 | if(token_charset_hyphen == tok && have_prev) | |
239 | { | |
240 | // remember the current position | |
241 | FwdIter iprev2 = begin; | |
242 | have_prev = false; | |
243 | ||
244 | // ch_prev is lower bound of a range | |
245 | switch(tr.get_charset_token(begin, end)) | |
246 | { | |
247 | case token_charset_hyphen: | |
248 | case token_charset_invert: | |
249 | begin = iprev2; // un-get these tokens and fall through | |
250 | BOOST_FALLTHROUGH; | |
251 | case token_literal: | |
252 | ch_next = *begin++; | |
253 | BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range"); | |
254 | chset.set_range(ch_prev, ch_next, rxtraits, icase); | |
255 | continue; | |
256 | case token_charset_backspace: | |
257 | ch_next = char_type(8); // backspace | |
258 | BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range"); | |
259 | chset.set_range(ch_prev, ch_next, rxtraits, icase); | |
260 | continue; | |
261 | case token_escape: | |
262 | esc = parse_escape(begin, end, tr); | |
263 | if(escape_char == esc.type_) | |
264 | { | |
265 | BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range"); | |
266 | chset.set_range(ch_prev, esc.ch_, rxtraits, icase); | |
267 | continue; | |
268 | } | |
269 | BOOST_FALLTHROUGH; | |
270 | case token_charset_end: | |
271 | default: // not a range. | |
272 | begin = iprev; // backup to hyphen token | |
273 | chset.set_char(ch_prev, rxtraits, icase); | |
274 | chset.set_char(*begin++, rxtraits, icase); | |
275 | continue; | |
276 | } | |
277 | } | |
278 | ||
279 | if(have_prev) | |
280 | { | |
281 | chset.set_char(ch_prev, rxtraits, icase); | |
282 | have_prev = false; | |
283 | } | |
284 | ||
285 | switch(tok) | |
286 | { | |
287 | case token_charset_hyphen: | |
288 | case token_charset_invert: | |
289 | case token_charset_end: | |
290 | case token_posix_charset_end: | |
291 | begin = iprev; // un-get these tokens | |
292 | ch_prev = *begin++; | |
293 | have_prev = true; | |
294 | continue; | |
295 | ||
296 | case token_charset_backspace: | |
297 | ch_prev = char_type(8); // backspace | |
298 | have_prev = true; | |
299 | continue; | |
300 | ||
301 | case token_posix_charset_begin: | |
302 | { | |
303 | FwdIter tmp = begin, start = begin; | |
304 | bool invert = (token_charset_invert == tr.get_charset_token(tmp, end)); | |
305 | if(invert) | |
306 | { | |
307 | begin = start = tmp; | |
308 | } | |
309 | while(token_literal == (tok = tr.get_charset_token(begin, end))) | |
310 | { | |
311 | tmp = ++begin; | |
312 | BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); | |
313 | } | |
314 | if(token_posix_charset_end == tok) | |
315 | { | |
316 | char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase); | |
317 | BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name"); | |
318 | chset.set_class(chclass, invert); | |
319 | continue; | |
320 | } | |
321 | begin = iprev; // un-get this token | |
322 | ch_prev = *begin++; | |
323 | have_prev = true; | |
324 | } | |
325 | continue; | |
326 | ||
327 | case token_escape: | |
328 | esc = parse_escape(begin, end, tr); | |
329 | if(escape_char == esc.type_) | |
330 | { | |
331 | ch_prev = esc.ch_; | |
332 | have_prev = true; | |
333 | } | |
334 | else if(escape_class == esc.type_) | |
335 | { | |
336 | char_class_type upper_ = lookup_classname(rxtraits, "upper"); | |
337 | BOOST_ASSERT(0 != upper_); | |
338 | chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_)); | |
339 | } | |
340 | else | |
341 | { | |
342 | BOOST_ASSERT(false); | |
343 | } | |
344 | continue; | |
345 | ||
346 | default: | |
347 | ch_prev = *begin++; | |
348 | have_prev = true; | |
349 | continue; | |
350 | } | |
351 | } | |
352 | while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"), | |
353 | token_charset_end != (tok = tr.get_charset_token(begin, end))); | |
354 | ||
355 | if(have_prev) | |
356 | { | |
357 | chset.set_char(ch_prev, rxtraits, icase); | |
358 | } | |
359 | ||
360 | if(invert) | |
361 | { | |
362 | chset.inverse(); | |
363 | } | |
364 | } | |
365 | ||
366 | }}} // namespace boost::xpressive::detail | |
367 | ||
368 | #endif |