]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/spirit/include/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / boost / libs / spirit / include / boost / spirit / home / support / detail / lexer / parser / tokeniser / re_tokeniser.hpp
1 // tokeniser.hpp
2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 #ifndef BOOST_LEXER_RE_TOKENISER_HPP
7 #define BOOST_LEXER_RE_TOKENISER_HPP
8
9 // memcpy()
10 #include <cstring>
11 #include <map>
12 #include "num_token.hpp"
13 #include "../../runtime_error.hpp"
14 #include "../../size_t.hpp"
15 #include <sstream>
16 #include "../../string_token.hpp"
17 #include "re_tokeniser_helper.hpp"
18
19 namespace boost
20 {
21 namespace lexer
22 {
23 namespace detail
24 {
25 template<typename CharT>
26 class basic_re_tokeniser
27 {
28 public:
29 typedef basic_num_token<CharT> num_token;
30 typedef basic_re_tokeniser_state<CharT> state;
31 typedef basic_string_token<CharT> string_token;
32 typedef typename string_token::string string;
33 typedef std::map<string_token, std::size_t> token_map;
34 typedef std::pair<string_token, std::size_t> token_pair;
35
36 static void next (state &state_, token_map &map_, num_token &token_)
37 {
38 CharT ch_ = 0;
39 bool eos_ = state_.next (ch_);
40
41 token_.min_max (0, false, 0);
42
43 while (!eos_ && ch_ == '"')
44 {
45 state_._in_string ^= 1;
46 eos_ = state_.next (ch_);
47 }
48
49 if (eos_)
50 {
51 if (state_._in_string)
52 {
53 throw runtime_error ("Unexpected end of regex "
54 "(missing '\"').");
55 }
56
57 if (state_._paren_count)
58 {
59 throw runtime_error ("Unexpected end of regex "
60 "(missing ')').");
61 }
62
63 token_.set (num_token::END, null_token);
64 }
65 else
66 {
67 if (ch_ == '\\')
68 {
69 // Even if we are in a string, respect escape sequences...
70 escape (state_, map_, token_);
71 }
72 else if (state_._in_string)
73 {
74 // All other meta characters lose their special meaning
75 // inside a string.
76 create_charset_token (string (1, ch_), false, map_, token_);
77 }
78 else
79 {
80 // Not an escape sequence and not inside a string, so
81 // check for meta characters.
82 switch (ch_)
83 {
84 case '(':
85 token_.set (num_token::OPENPAREN, null_token);
86 ++state_._paren_count;
87 read_options (state_);
88 break;
89 case ')':
90 --state_._paren_count;
91
92 if (state_._paren_count < 0)
93 {
94 std::ostringstream ss_;
95
96 ss_ << "Number of open parenthesis < 0 at index " <<
97 state_.index () - 1 << '.';
98 throw runtime_error (ss_.str ().c_str ());
99 }
100
101 token_.set (num_token::CLOSEPAREN, null_token);
102
103 if (!state_._flags_stack.empty ())
104 {
105 state_._flags = state_._flags_stack.top ();
106 state_._flags_stack.pop ();
107 }
108 break;
109 case '?':
110 if (!state_.eos () && *state_._curr == '?')
111 {
112 token_.set (num_token::AOPT, null_token);
113 state_.increment ();
114 }
115 else
116 {
117 token_.set (num_token::OPT, null_token);
118 }
119
120 break;
121 case '*':
122 if (!state_.eos () && *state_._curr == '?')
123 {
124 token_.set (num_token::AZEROORMORE, null_token);
125 state_.increment ();
126 }
127 else
128 {
129 token_.set (num_token::ZEROORMORE, null_token);
130 }
131
132 break;
133 case '+':
134 if (!state_.eos () && *state_._curr == '?')
135 {
136 token_.set (num_token::AONEORMORE, null_token);
137 state_.increment ();
138 }
139 else
140 {
141 token_.set (num_token::ONEORMORE, null_token);
142 }
143
144 break;
145 case '{':
146 open_curly (state_, token_);
147 break;
148 case '|':
149 token_.set (num_token::OR, null_token);
150 break;
151 case '^':
152 if (state_._curr - 1 == state_._start)
153 {
154 token_.set (num_token::CHARSET, bol_token);
155 state_._seen_BOL_assertion = true;
156 }
157 else
158 {
159 create_charset_token (string (1, ch_), false,
160 map_, token_);
161 }
162
163 break;
164 case '$':
165 if (state_._curr == state_._end)
166 {
167 token_.set (num_token::CHARSET, eol_token);
168 state_._seen_EOL_assertion = true;
169 }
170 else
171 {
172 create_charset_token (string (1, ch_), false,
173 map_, token_);
174 }
175
176 break;
177 case '.':
178 {
179 string dot_;
180
181 if (state_._flags & dot_not_newline)
182 {
183 dot_ = '\n';
184 }
185
186 create_charset_token (dot_, true, map_, token_);
187 break;
188 }
189 case '[':
190 {
191 charset (state_, map_, token_);
192 break;
193 }
194 case '/':
195 throw runtime_error("Lookahead ('/') is not supported yet.");
196 break;
197 default:
198 if ((state_._flags & icase) &&
199 (std::isupper (ch_, state_._locale) ||
200 std::islower (ch_, state_._locale)))
201 {
202 CharT upper_ = std::toupper (ch_, state_._locale);
203 CharT lower_ = std::tolower (ch_, state_._locale);
204
205 string str_ (1, upper_);
206
207 str_ += lower_;
208 create_charset_token (str_, false, map_, token_);
209 }
210 else
211 {
212 create_charset_token (string (1, ch_), false,
213 map_, token_);
214 }
215
216 break;
217 }
218 }
219 }
220 }
221
222 private:
223 typedef basic_re_tokeniser_helper<CharT> tokeniser_helper;
224
225 static void read_options (state &state_)
226 {
227 if (!state_.eos () && *state_._curr == '?')
228 {
229 CharT ch_ = 0;
230 bool eos_ = false;
231 bool negate_ = false;
232
233 state_.increment ();
234 eos_ = state_.next (ch_);
235 state_._flags_stack.push (state_._flags);
236
237 while (!eos_ && ch_ != ':')
238 {
239 switch (ch_)
240 {
241 case '-':
242 negate_ ^= 1;
243 break;
244 case 'i':
245 if (negate_)
246 {
247 state_._flags = static_cast<regex_flags>
248 (state_._flags & ~icase);
249 }
250 else
251 {
252 state_._flags = static_cast<regex_flags>
253 (state_._flags | icase);
254 }
255
256 negate_ = false;
257 break;
258 case 's':
259 if (negate_)
260 {
261 state_._flags = static_cast<regex_flags>
262 (state_._flags | dot_not_newline);
263 }
264 else
265 {
266 state_._flags = static_cast<regex_flags>
267 (state_._flags & ~dot_not_newline);
268 }
269
270 negate_ = false;
271 break;
272 default:
273 {
274 std::ostringstream ss_;
275
276 ss_ << "Unknown option at index " <<
277 state_.index () - 1 << '.';
278 throw runtime_error (ss_.str ().c_str ());
279 }
280 }
281
282 eos_ = state_.next (ch_);
283 }
284
285 // End of string handler will handle early termination
286 }
287 else if (!state_._flags_stack.empty ())
288 {
289 state_._flags_stack.push (state_._flags);
290 }
291 }
292
293 static void escape (state &state_, token_map &map_, num_token &token_)
294 {
295 CharT ch_ = 0;
296 std::size_t str_len_ = 0;
297 const CharT *str_ = tokeniser_helper::escape_sequence (state_,
298 ch_, str_len_);
299
300 if (str_)
301 {
302 state state2_ (str_ + 1, str_ + str_len_, state_._flags,
303 state_._locale);
304
305 charset (state2_, map_, token_);
306 }
307 else
308 {
309 create_charset_token (string (1, ch_), false, map_, token_);
310 }
311 }
312
313 static void charset (state &state_, token_map &map_, num_token &token_)
314 {
315 string chars_;
316 bool negated_ = false;
317
318 tokeniser_helper::charset (state_, chars_, negated_);
319 create_charset_token (chars_, negated_, map_, token_);
320 }
321
322 static void create_charset_token (const string &charset_,
323 const bool negated_, token_map &map_, num_token &token_)
324 {
325 std::size_t id_ = null_token;
326 string_token stok_ (negated_, charset_);
327
328 stok_.remove_duplicates ();
329 stok_.normalise ();
330
331 typename token_map::const_iterator iter_ = map_.find (stok_);
332
333 if (iter_ == map_.end ())
334 {
335 id_ = map_.size ();
336 map_.insert (token_pair (stok_, id_));
337 }
338 else
339 {
340 id_ = iter_->second;
341 }
342
343 token_.set (num_token::CHARSET, id_);
344 }
345
346 static void open_curly (state &state_, num_token &token_)
347 {
348 if (state_.eos ())
349 {
350 throw runtime_error ("Unexpected end of regex "
351 "(missing '}').");
352 }
353 else if (*state_._curr >= '0' && *state_._curr <= '9')
354 {
355 repeat_n (state_, token_);
356
357 if (!state_.eos () && *state_._curr == '?')
358 {
359 token_._type = num_token::AREPEATN;
360 state_.increment ();
361 }
362 }
363 else
364 {
365 macro (state_, token_);
366 }
367 }
368
369 // SYNTAX:
370 // {n[,[n]]}
371 // SEMANTIC RULES:
372 // {0} - INVALID (throw exception)
373 // {0,} = *
374 // {0,0} - INVALID (throw exception)
375 // {0,1} = ?
376 // {1,} = +
377 // {min,max} where min == max - {min}
378 // {min,max} where max < min - INVALID (throw exception)
379 static void repeat_n (state &state_, num_token &token_)
380 {
381 CharT ch_ = 0;
382 bool eos_ = state_.next (ch_);
383
384 while (!eos_ && ch_ >= '0' && ch_ <= '9')
385 {
386 token_._min *= 10;
387 token_._min += ch_ - '0';
388 eos_ = state_.next (ch_);
389 }
390
391 if (eos_)
392 {
393 throw runtime_error ("Unexpected end of regex "
394 "(missing '}').");
395 }
396
397 bool min_max_ = false;
398 bool repeatn_ = true;
399
400 token_._comma = ch_ == ',';
401
402 if (token_._comma)
403 {
404 eos_ = state_.next (ch_);
405
406 if (eos_)
407 {
408 throw runtime_error ("Unexpected end of regex "
409 "(missing '}').");
410 }
411
412 if (ch_ == '}')
413 {
414 // Small optimisation: Check for '*' equivalency.
415 if (token_._min == 0)
416 {
417 token_.set (num_token::ZEROORMORE, null_token);
418 repeatn_ = false;
419 }
420 // Small optimisation: Check for '+' equivalency.
421 else if (token_._min == 1)
422 {
423 token_.set (num_token::ONEORMORE, null_token);
424 repeatn_ = false;
425 }
426 }
427 else
428 {
429 if (ch_ < '0' || ch_ > '9')
430 {
431 std::ostringstream ss_;
432
433 ss_ << "Missing '}' at index " <<
434 state_.index () - 1 << '.';
435 throw runtime_error (ss_.str ().c_str ());
436 }
437
438 min_max_ = true;
439
440 do
441 {
442 token_._max *= 10;
443 token_._max += ch_ - '0';
444 eos_ = state_.next (ch_);
445 } while (!eos_ && ch_ >= '0' && ch_ <= '9');
446
447 if (eos_)
448 {
449 throw runtime_error ("Unexpected end of regex "
450 "(missing '}').");
451 }
452
453 // Small optimisation: Check for '?' equivalency.
454 if (token_._min == 0 && token_._max == 1)
455 {
456 token_.set (num_token::OPT, null_token);
457 repeatn_ = false;
458 }
459 // Small optimisation: if min == max, then min.
460 else if (token_._min == token_._max)
461 {
462 token_._comma = false;
463 min_max_ = false;
464 token_._max = 0;
465 }
466 }
467 }
468
469 if (ch_ != '}')
470 {
471 std::ostringstream ss_;
472
473 ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
474 throw runtime_error (ss_.str ().c_str ());
475 }
476
477 if (repeatn_)
478 {
479 // SEMANTIC VALIDATION follows:
480 // NOTE: {0,} has already become *
481 // therefore we don't check for a comma.
482 if (token_._min == 0 && token_._max == 0)
483 {
484 std::ostringstream ss_;
485
486 ss_ << "Cannot have exactly zero repeats preceding index " <<
487 state_.index () << '.';
488 throw runtime_error (ss_.str ().c_str ());
489 }
490
491 if (min_max_ && token_._max < token_._min)
492 {
493 std::ostringstream ss_;
494
495 ss_ << "Max less than min preceding index " <<
496 state_.index () << '.';
497 throw runtime_error (ss_.str ().c_str ());
498 }
499
500 token_.set (num_token::REPEATN, null_token);
501 }
502 }
503
504 static void macro (state &state_, num_token &token_)
505 {
506 CharT ch_ = 0;
507 bool eos_ = false;
508 const CharT *start_ = state_._curr;
509
510 state_.next (ch_);
511
512 if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') &&
513 !(ch_ >= 'a' && ch_ <= 'z'))
514 {
515 std::ostringstream ss_;
516
517 ss_ << "Invalid MACRO name at index " <<
518 state_.index () - 1 << '.';
519 throw runtime_error (ss_.str ().c_str ());
520 }
521
522 do
523 {
524 eos_ = state_.next (ch_);
525
526 if (eos_)
527 {
528 throw runtime_error ("Unexpected end of regex "
529 "(missing '}').");
530 }
531 } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') ||
532 (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9'));
533
534 if (ch_ != '}')
535 {
536 std::ostringstream ss_;
537
538 ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
539 throw runtime_error (ss_.str ().c_str ());
540 }
541
542 std::size_t len_ = state_._curr - 1 - start_;
543
544 if (len_ > max_macro_len)
545 {
546 std::basic_stringstream<CharT> ss_;
547 std::ostringstream os_;
548
549 os_ << "MACRO name '";
550
551 while (len_)
552 {
553 os_ << ss_.narrow (*start_++, ' ');
554 --len_;
555 }
556
557 os_ << "' too long.";
558 throw runtime_error (os_.str ());
559 }
560
561 token_.set (num_token::MACRO, null_token);
562
563 // Some systems have memcpy in namespace std.
564 using namespace std;
565
566 memcpy (token_._macro, start_, len_ * sizeof (CharT));
567 token_._macro[len_] = 0;
568 }
569 };
570 }
571 }
572 }
573
574 #endif