]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // tokeniser.hpp |
2 | // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
6 | #ifndef BOOST_LEXER_RE_TOKENISER_HPP | |
7 | #define BOOST_LEXER_RE_TOKENISER_HPP | |
8 | ||
9 | // memcpy() | |
10 | #include <cstring> | |
11 | #include <map> | |
12 | #include "num_token.hpp" | |
13 | #include "../../runtime_error.hpp" | |
14 | #include "../../size_t.hpp" | |
15 | #include <sstream> | |
16 | #include "../../string_token.hpp" | |
17 | #include "re_tokeniser_helper.hpp" | |
18 | ||
19 | namespace boost | |
20 | { | |
21 | namespace lexer | |
22 | { | |
23 | namespace detail | |
24 | { | |
25 | template<typename CharT> | |
26 | class basic_re_tokeniser | |
27 | { | |
28 | public: | |
29 | typedef basic_num_token<CharT> num_token; | |
30 | typedef basic_re_tokeniser_state<CharT> state; | |
31 | typedef basic_string_token<CharT> string_token; | |
32 | typedef typename string_token::string string; | |
33 | typedef std::map<string_token, std::size_t> token_map; | |
34 | typedef std::pair<string_token, std::size_t> token_pair; | |
35 | ||
36 | static void next (state &state_, token_map &map_, num_token &token_) | |
37 | { | |
38 | CharT ch_ = 0; | |
39 | bool eos_ = state_.next (ch_); | |
40 | ||
41 | token_.min_max (0, false, 0); | |
42 | ||
43 | while (!eos_ && ch_ == '"') | |
44 | { | |
45 | state_._in_string ^= 1; | |
46 | eos_ = state_.next (ch_); | |
47 | } | |
48 | ||
49 | if (eos_) | |
50 | { | |
51 | if (state_._in_string) | |
52 | { | |
53 | throw runtime_error ("Unexpected end of regex " | |
54 | "(missing '\"')."); | |
55 | } | |
56 | ||
57 | if (state_._paren_count) | |
58 | { | |
59 | throw runtime_error ("Unexpected end of regex " | |
60 | "(missing ')')."); | |
61 | } | |
62 | ||
63 | token_.set (num_token::END, null_token); | |
64 | } | |
65 | else | |
66 | { | |
67 | if (ch_ == '\\') | |
68 | { | |
69 | // Even if we are in a string, respect escape sequences... | |
70 | escape (state_, map_, token_); | |
71 | } | |
72 | else if (state_._in_string) | |
73 | { | |
74 | // All other meta characters lose their special meaning | |
75 | // inside a string. | |
76 | create_charset_token (string (1, ch_), false, map_, token_); | |
77 | } | |
78 | else | |
79 | { | |
80 | // Not an escape sequence and not inside a string, so | |
81 | // check for meta characters. | |
82 | switch (ch_) | |
83 | { | |
84 | case '(': | |
85 | token_.set (num_token::OPENPAREN, null_token); | |
86 | ++state_._paren_count; | |
87 | read_options (state_); | |
88 | break; | |
89 | case ')': | |
90 | --state_._paren_count; | |
91 | ||
92 | if (state_._paren_count < 0) | |
93 | { | |
94 | std::ostringstream ss_; | |
95 | ||
96 | ss_ << "Number of open parenthesis < 0 at index " << | |
97 | state_.index () - 1 << '.'; | |
98 | throw runtime_error (ss_.str ().c_str ()); | |
99 | } | |
100 | ||
101 | token_.set (num_token::CLOSEPAREN, null_token); | |
102 | ||
103 | if (!state_._flags_stack.empty ()) | |
104 | { | |
105 | state_._flags = state_._flags_stack.top (); | |
106 | state_._flags_stack.pop (); | |
107 | } | |
108 | break; | |
109 | case '?': | |
110 | if (!state_.eos () && *state_._curr == '?') | |
111 | { | |
112 | token_.set (num_token::AOPT, null_token); | |
113 | state_.increment (); | |
114 | } | |
115 | else | |
116 | { | |
117 | token_.set (num_token::OPT, null_token); | |
118 | } | |
119 | ||
120 | break; | |
121 | case '*': | |
122 | if (!state_.eos () && *state_._curr == '?') | |
123 | { | |
124 | token_.set (num_token::AZEROORMORE, null_token); | |
125 | state_.increment (); | |
126 | } | |
127 | else | |
128 | { | |
129 | token_.set (num_token::ZEROORMORE, null_token); | |
130 | } | |
131 | ||
132 | break; | |
133 | case '+': | |
134 | if (!state_.eos () && *state_._curr == '?') | |
135 | { | |
136 | token_.set (num_token::AONEORMORE, null_token); | |
137 | state_.increment (); | |
138 | } | |
139 | else | |
140 | { | |
141 | token_.set (num_token::ONEORMORE, null_token); | |
142 | } | |
143 | ||
144 | break; | |
145 | case '{': | |
146 | open_curly (state_, token_); | |
147 | break; | |
148 | case '|': | |
149 | token_.set (num_token::OR, null_token); | |
150 | break; | |
151 | case '^': | |
152 | if (state_._curr - 1 == state_._start) | |
153 | { | |
154 | token_.set (num_token::CHARSET, bol_token); | |
155 | state_._seen_BOL_assertion = true; | |
156 | } | |
157 | else | |
158 | { | |
159 | create_charset_token (string (1, ch_), false, | |
160 | map_, token_); | |
161 | } | |
162 | ||
163 | break; | |
164 | case '$': | |
165 | if (state_._curr == state_._end) | |
166 | { | |
167 | token_.set (num_token::CHARSET, eol_token); | |
168 | state_._seen_EOL_assertion = true; | |
169 | } | |
170 | else | |
171 | { | |
172 | create_charset_token (string (1, ch_), false, | |
173 | map_, token_); | |
174 | } | |
175 | ||
176 | break; | |
177 | case '.': | |
178 | { | |
179 | string dot_; | |
180 | ||
181 | if (state_._flags & dot_not_newline) | |
182 | { | |
183 | dot_ = '\n'; | |
184 | } | |
185 | ||
186 | create_charset_token (dot_, true, map_, token_); | |
187 | break; | |
188 | } | |
189 | case '[': | |
190 | { | |
191 | charset (state_, map_, token_); | |
192 | break; | |
193 | } | |
194 | case '/': | |
195 | throw runtime_error("Lookahead ('/') is not supported yet."); | |
196 | break; | |
197 | default: | |
198 | if ((state_._flags & icase) && | |
199 | (std::isupper (ch_, state_._locale) || | |
200 | std::islower (ch_, state_._locale))) | |
201 | { | |
202 | CharT upper_ = std::toupper (ch_, state_._locale); | |
203 | CharT lower_ = std::tolower (ch_, state_._locale); | |
204 | ||
205 | string str_ (1, upper_); | |
206 | ||
207 | str_ += lower_; | |
208 | create_charset_token (str_, false, map_, token_); | |
209 | } | |
210 | else | |
211 | { | |
212 | create_charset_token (string (1, ch_), false, | |
213 | map_, token_); | |
214 | } | |
215 | ||
216 | break; | |
217 | } | |
218 | } | |
219 | } | |
220 | } | |
221 | ||
222 | private: | |
223 | typedef basic_re_tokeniser_helper<CharT> tokeniser_helper; | |
224 | ||
225 | static void read_options (state &state_) | |
226 | { | |
227 | if (!state_.eos () && *state_._curr == '?') | |
228 | { | |
229 | CharT ch_ = 0; | |
230 | bool eos_ = false; | |
231 | bool negate_ = false; | |
232 | ||
233 | state_.increment (); | |
234 | eos_ = state_.next (ch_); | |
235 | state_._flags_stack.push (state_._flags); | |
236 | ||
237 | while (!eos_ && ch_ != ':') | |
238 | { | |
239 | switch (ch_) | |
240 | { | |
241 | case '-': | |
242 | negate_ ^= 1; | |
243 | break; | |
244 | case 'i': | |
245 | if (negate_) | |
246 | { | |
247 | state_._flags = static_cast<regex_flags> | |
248 | (state_._flags & ~icase); | |
249 | } | |
250 | else | |
251 | { | |
252 | state_._flags = static_cast<regex_flags> | |
253 | (state_._flags | icase); | |
254 | } | |
255 | ||
256 | negate_ = false; | |
257 | break; | |
258 | case 's': | |
259 | if (negate_) | |
260 | { | |
261 | state_._flags = static_cast<regex_flags> | |
262 | (state_._flags | dot_not_newline); | |
263 | } | |
264 | else | |
265 | { | |
266 | state_._flags = static_cast<regex_flags> | |
267 | (state_._flags & ~dot_not_newline); | |
268 | } | |
269 | ||
270 | negate_ = false; | |
271 | break; | |
272 | default: | |
273 | { | |
274 | std::ostringstream ss_; | |
275 | ||
276 | ss_ << "Unknown option at index " << | |
277 | state_.index () - 1 << '.'; | |
278 | throw runtime_error (ss_.str ().c_str ()); | |
279 | } | |
280 | } | |
281 | ||
282 | eos_ = state_.next (ch_); | |
283 | } | |
284 | ||
285 | // End of string handler will handle early termination | |
286 | } | |
287 | else if (!state_._flags_stack.empty ()) | |
288 | { | |
289 | state_._flags_stack.push (state_._flags); | |
290 | } | |
291 | } | |
292 | ||
293 | static void escape (state &state_, token_map &map_, num_token &token_) | |
294 | { | |
295 | CharT ch_ = 0; | |
296 | std::size_t str_len_ = 0; | |
297 | const CharT *str_ = tokeniser_helper::escape_sequence (state_, | |
298 | ch_, str_len_); | |
299 | ||
300 | if (str_) | |
301 | { | |
302 | state state2_ (str_ + 1, str_ + str_len_, state_._flags, | |
303 | state_._locale); | |
304 | ||
305 | charset (state2_, map_, token_); | |
306 | } | |
307 | else | |
308 | { | |
309 | create_charset_token (string (1, ch_), false, map_, token_); | |
310 | } | |
311 | } | |
312 | ||
313 | static void charset (state &state_, token_map &map_, num_token &token_) | |
314 | { | |
315 | string chars_; | |
316 | bool negated_ = false; | |
317 | ||
318 | tokeniser_helper::charset (state_, chars_, negated_); | |
319 | create_charset_token (chars_, negated_, map_, token_); | |
320 | } | |
321 | ||
322 | static void create_charset_token (const string &charset_, | |
323 | const bool negated_, token_map &map_, num_token &token_) | |
324 | { | |
325 | std::size_t id_ = null_token; | |
326 | string_token stok_ (negated_, charset_); | |
327 | ||
328 | stok_.remove_duplicates (); | |
329 | stok_.normalise (); | |
330 | ||
331 | typename token_map::const_iterator iter_ = map_.find (stok_); | |
332 | ||
333 | if (iter_ == map_.end ()) | |
334 | { | |
335 | id_ = map_.size (); | |
336 | map_.insert (token_pair (stok_, id_)); | |
337 | } | |
338 | else | |
339 | { | |
340 | id_ = iter_->second; | |
341 | } | |
342 | ||
343 | token_.set (num_token::CHARSET, id_); | |
344 | } | |
345 | ||
346 | static void open_curly (state &state_, num_token &token_) | |
347 | { | |
348 | if (state_.eos ()) | |
349 | { | |
350 | throw runtime_error ("Unexpected end of regex " | |
351 | "(missing '}')."); | |
352 | } | |
353 | else if (*state_._curr >= '0' && *state_._curr <= '9') | |
354 | { | |
355 | repeat_n (state_, token_); | |
356 | ||
357 | if (!state_.eos () && *state_._curr == '?') | |
358 | { | |
359 | token_._type = num_token::AREPEATN; | |
360 | state_.increment (); | |
361 | } | |
362 | } | |
363 | else | |
364 | { | |
365 | macro (state_, token_); | |
366 | } | |
367 | } | |
368 | ||
369 | // SYNTAX: | |
370 | // {n[,[n]]} | |
371 | // SEMANTIC RULES: | |
372 | // {0} - INVALID (throw exception) | |
373 | // {0,} = * | |
374 | // {0,0} - INVALID (throw exception) | |
375 | // {0,1} = ? | |
376 | // {1,} = + | |
377 | // {min,max} where min == max - {min} | |
378 | // {min,max} where max < min - INVALID (throw exception) | |
379 | static void repeat_n (state &state_, num_token &token_) | |
380 | { | |
381 | CharT ch_ = 0; | |
382 | bool eos_ = state_.next (ch_); | |
383 | ||
384 | while (!eos_ && ch_ >= '0' && ch_ <= '9') | |
385 | { | |
386 | token_._min *= 10; | |
387 | token_._min += ch_ - '0'; | |
388 | eos_ = state_.next (ch_); | |
389 | } | |
390 | ||
391 | if (eos_) | |
392 | { | |
393 | throw runtime_error ("Unexpected end of regex " | |
394 | "(missing '}')."); | |
395 | } | |
396 | ||
397 | bool min_max_ = false; | |
398 | bool repeatn_ = true; | |
399 | ||
400 | token_._comma = ch_ == ','; | |
401 | ||
402 | if (token_._comma) | |
403 | { | |
404 | eos_ = state_.next (ch_); | |
405 | ||
406 | if (eos_) | |
407 | { | |
408 | throw runtime_error ("Unexpected end of regex " | |
409 | "(missing '}')."); | |
410 | } | |
411 | ||
412 | if (ch_ == '}') | |
413 | { | |
414 | // Small optimisation: Check for '*' equivalency. | |
415 | if (token_._min == 0) | |
416 | { | |
417 | token_.set (num_token::ZEROORMORE, null_token); | |
418 | repeatn_ = false; | |
419 | } | |
420 | // Small optimisation: Check for '+' equivalency. | |
421 | else if (token_._min == 1) | |
422 | { | |
423 | token_.set (num_token::ONEORMORE, null_token); | |
424 | repeatn_ = false; | |
425 | } | |
426 | } | |
427 | else | |
428 | { | |
429 | if (ch_ < '0' || ch_ > '9') | |
430 | { | |
431 | std::ostringstream ss_; | |
432 | ||
433 | ss_ << "Missing '}' at index " << | |
434 | state_.index () - 1 << '.'; | |
435 | throw runtime_error (ss_.str ().c_str ()); | |
436 | } | |
437 | ||
438 | min_max_ = true; | |
439 | ||
440 | do | |
441 | { | |
442 | token_._max *= 10; | |
443 | token_._max += ch_ - '0'; | |
444 | eos_ = state_.next (ch_); | |
445 | } while (!eos_ && ch_ >= '0' && ch_ <= '9'); | |
446 | ||
447 | if (eos_) | |
448 | { | |
449 | throw runtime_error ("Unexpected end of regex " | |
450 | "(missing '}')."); | |
451 | } | |
452 | ||
453 | // Small optimisation: Check for '?' equivalency. | |
454 | if (token_._min == 0 && token_._max == 1) | |
455 | { | |
456 | token_.set (num_token::OPT, null_token); | |
457 | repeatn_ = false; | |
458 | } | |
459 | // Small optimisation: if min == max, then min. | |
460 | else if (token_._min == token_._max) | |
461 | { | |
462 | token_._comma = false; | |
463 | min_max_ = false; | |
464 | token_._max = 0; | |
465 | } | |
466 | } | |
467 | } | |
468 | ||
469 | if (ch_ != '}') | |
470 | { | |
471 | std::ostringstream ss_; | |
472 | ||
473 | ss_ << "Missing '}' at index " << state_.index () - 1 << '.'; | |
474 | throw runtime_error (ss_.str ().c_str ()); | |
475 | } | |
476 | ||
477 | if (repeatn_) | |
478 | { | |
479 | // SEMANTIC VALIDATION follows: | |
480 | // NOTE: {0,} has already become * | |
481 | // therefore we don't check for a comma. | |
482 | if (token_._min == 0 && token_._max == 0) | |
483 | { | |
484 | std::ostringstream ss_; | |
485 | ||
486 | ss_ << "Cannot have exactly zero repeats preceding index " << | |
487 | state_.index () << '.'; | |
488 | throw runtime_error (ss_.str ().c_str ()); | |
489 | } | |
490 | ||
491 | if (min_max_ && token_._max < token_._min) | |
492 | { | |
493 | std::ostringstream ss_; | |
494 | ||
495 | ss_ << "Max less than min preceding index " << | |
496 | state_.index () << '.'; | |
497 | throw runtime_error (ss_.str ().c_str ()); | |
498 | } | |
499 | ||
500 | token_.set (num_token::REPEATN, null_token); | |
501 | } | |
502 | } | |
503 | ||
504 | static void macro (state &state_, num_token &token_) | |
505 | { | |
506 | CharT ch_ = 0; | |
507 | bool eos_ = false; | |
508 | const CharT *start_ = state_._curr; | |
509 | ||
510 | state_.next (ch_); | |
511 | ||
512 | if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') && | |
513 | !(ch_ >= 'a' && ch_ <= 'z')) | |
514 | { | |
515 | std::ostringstream ss_; | |
516 | ||
517 | ss_ << "Invalid MACRO name at index " << | |
518 | state_.index () - 1 << '.'; | |
519 | throw runtime_error (ss_.str ().c_str ()); | |
520 | } | |
521 | ||
522 | do | |
523 | { | |
524 | eos_ = state_.next (ch_); | |
525 | ||
526 | if (eos_) | |
527 | { | |
528 | throw runtime_error ("Unexpected end of regex " | |
529 | "(missing '}')."); | |
530 | } | |
531 | } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') || | |
532 | (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9')); | |
533 | ||
534 | if (ch_ != '}') | |
535 | { | |
536 | std::ostringstream ss_; | |
537 | ||
538 | ss_ << "Missing '}' at index " << state_.index () - 1 << '.'; | |
539 | throw runtime_error (ss_.str ().c_str ()); | |
540 | } | |
541 | ||
542 | std::size_t len_ = state_._curr - 1 - start_; | |
543 | ||
544 | if (len_ > max_macro_len) | |
545 | { | |
546 | std::basic_stringstream<CharT> ss_; | |
547 | std::ostringstream os_; | |
548 | ||
549 | os_ << "MACRO name '"; | |
550 | ||
551 | while (len_) | |
552 | { | |
553 | os_ << ss_.narrow (*start_++, ' '); | |
554 | --len_; | |
555 | } | |
556 | ||
557 | os_ << "' too long."; | |
558 | throw runtime_error (os_.str ()); | |
559 | } | |
560 | ||
561 | token_.set (num_token::MACRO, null_token); | |
562 | ||
563 | // Some systems have memcpy in namespace std. | |
564 | using namespace std; | |
565 | ||
566 | memcpy (token_._macro, start_, len_ * sizeof (CharT)); | |
567 | token_._macro[len_] = 0; | |
568 | } | |
569 | }; | |
570 | } | |
571 | } | |
572 | } | |
573 | ||
574 | #endif |