]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // input.hpp |
2 | // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
6 | #ifndef BOOST_LEXER_INPUT | |
7 | #define BOOST_LEXER_INPUT | |
8 | ||
9 | #include "char_traits.hpp" | |
10 | #include <boost/detail/iterator.hpp> | |
11 | #include "size_t.hpp" | |
12 | #include "state_machine.hpp" | |
13 | ||
14 | namespace boost | |
15 | { | |
16 | namespace lexer | |
17 | { | |
18 | template<typename FwdIter, typename Traits = | |
19 | char_traits<typename boost::detail::iterator_traits<FwdIter>::value_type> > | |
20 | class basic_input | |
21 | { | |
22 | public: | |
23 | class iterator | |
24 | { | |
25 | public: | |
26 | friend class basic_input; | |
27 | ||
28 | struct data | |
29 | { | |
30 | std::size_t id; | |
31 | std::size_t unique_id; | |
32 | FwdIter start; | |
33 | FwdIter end; | |
34 | bool bol; | |
35 | std::size_t state; | |
36 | ||
37 | // Construct in end() state. | |
38 | data () : | |
39 | id (0), | |
40 | unique_id (npos), | |
41 | bol (false), | |
42 | state (npos) | |
43 | { | |
44 | } | |
45 | ||
46 | bool operator == (const data &rhs_) const | |
47 | { | |
48 | return id == rhs_.id && unique_id == rhs_.unique_id && | |
49 | start == rhs_.start && end == rhs_.end && | |
50 | bol == rhs_.bol && state == rhs_.state; | |
51 | } | |
52 | }; | |
53 | ||
54 | iterator () : | |
55 | _input (0) | |
56 | { | |
57 | } | |
58 | ||
59 | bool operator == (const iterator &rhs_) const | |
60 | { | |
61 | return _data == rhs_._data; | |
62 | } | |
63 | ||
64 | bool operator != (const iterator &rhs_) const | |
65 | { | |
66 | return !(*this == rhs_); | |
67 | } | |
68 | ||
69 | data &operator * () | |
70 | { | |
71 | return _data; | |
72 | } | |
73 | ||
74 | data *operator -> () | |
75 | { | |
76 | return &_data; | |
77 | } | |
78 | ||
79 | // Let compiler generate operator = (). | |
80 | ||
81 | // prefix version | |
82 | iterator &operator ++ () | |
83 | { | |
84 | next_token (); | |
85 | return *this; | |
86 | } | |
87 | ||
88 | // postfix version | |
89 | iterator operator ++ (int) | |
90 | { | |
91 | iterator iter_ = *this; | |
92 | ||
93 | next_token (); | |
94 | return iter_; | |
95 | } | |
96 | ||
97 | private: | |
98 | // Not owner (obviously!) | |
99 | const basic_input *_input; | |
100 | data _data; | |
101 | ||
102 | void next_token () | |
103 | { | |
104 | const detail::internals &internals_ = | |
105 | _input->_state_machine->data (); | |
106 | ||
107 | _data.start = _data.end; | |
108 | ||
109 | if (internals_._dfa->size () == 1) | |
110 | { | |
111 | if (internals_._seen_BOL_assertion || | |
112 | internals_._seen_EOL_assertion) | |
113 | { | |
114 | _data.id = next | |
115 | (&internals_._lookup->front ()->front (), | |
116 | internals_._dfa_alphabet.front (), | |
117 | &internals_._dfa->front ()->front (), | |
118 | _data.bol, _data.end, _input->_end, _data.unique_id); | |
119 | } | |
120 | else | |
121 | { | |
122 | _data.id = next (&internals_._lookup->front ()->front (), | |
123 | internals_._dfa_alphabet.front (), &internals_. | |
124 | _dfa->front ()->front (), _data.end, _input->_end, | |
125 | _data.unique_id); | |
126 | } | |
127 | } | |
128 | else | |
129 | { | |
130 | if (internals_._seen_BOL_assertion || | |
131 | internals_._seen_EOL_assertion) | |
132 | { | |
133 | _data.id = next (internals_, _data.state, | |
134 | _data.bol, _data.end, _input->_end, _data.unique_id); | |
135 | } | |
136 | else | |
137 | { | |
138 | _data.id = next (internals_, _data.state, | |
139 | _data.end, _input->_end, _data.unique_id); | |
140 | } | |
141 | } | |
142 | ||
143 | if (_data.end == _input->_end && _data.start == _data.end) | |
144 | { | |
145 | // Ensure current state matches that returned by end(). | |
146 | _data.state = npos; | |
147 | } | |
148 | } | |
149 | ||
150 | std::size_t next (const detail::internals &internals_, | |
151 | std::size_t &start_state_, bool bol_, | |
152 | FwdIter &start_token_, const FwdIter &end_, | |
153 | std::size_t &unique_id_) | |
154 | { | |
155 | if (start_token_ == end_) | |
156 | { | |
157 | unique_id_ = npos; | |
158 | return 0; | |
159 | } | |
160 | ||
161 | again: | |
162 | const std::size_t * lookup_ = &internals_._lookup[start_state_]-> | |
163 | front (); | |
164 | std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_]; | |
165 | const std::size_t *dfa_ = &internals_._dfa[start_state_]->front (); | |
166 | const std::size_t *ptr_ = dfa_ + dfa_alphabet_; | |
167 | FwdIter curr_ = start_token_; | |
168 | bool end_state_ = *ptr_ != 0; | |
169 | std::size_t id_ = *(ptr_ + id_index); | |
170 | std::size_t uid_ = *(ptr_ + unique_id_index); | |
171 | std::size_t end_start_state_ = start_state_; | |
172 | bool end_bol_ = bol_; | |
173 | FwdIter end_token_ = start_token_; | |
174 | ||
175 | while (curr_ != end_) | |
176 | { | |
177 | const std::size_t BOL_state_ = ptr_[bol_index]; | |
178 | const std::size_t EOL_state_ = ptr_[eol_index]; | |
179 | ||
180 | if (BOL_state_ && bol_) | |
181 | { | |
182 | ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; | |
183 | } | |
184 | else if (EOL_state_ && *curr_ == '\n') | |
185 | { | |
186 | ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; | |
187 | } | |
188 | else | |
189 | { | |
190 | typename Traits::char_type prev_char_ = *curr_++; | |
191 | ||
192 | bol_ = prev_char_ == '\n'; | |
193 | ||
194 | const std::size_t state_ = | |
195 | ptr_[lookup_[static_cast<typename Traits::index_type> | |
196 | (prev_char_)]]; | |
197 | ||
198 | if (state_ == 0) | |
199 | { | |
200 | break; | |
201 | } | |
202 | ||
203 | ptr_ = &dfa_[state_ * dfa_alphabet_]; | |
204 | } | |
205 | ||
206 | if (*ptr_) | |
207 | { | |
208 | end_state_ = true; | |
209 | id_ = *(ptr_ + id_index); | |
210 | uid_ = *(ptr_ + unique_id_index); | |
211 | end_start_state_ = *(ptr_ + state_index); | |
212 | end_bol_ = bol_; | |
213 | end_token_ = curr_; | |
214 | } | |
215 | } | |
216 | ||
217 | const std::size_t EOL_state_ = ptr_[eol_index]; | |
218 | ||
219 | if (EOL_state_ && curr_ == end_) | |
220 | { | |
221 | ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; | |
222 | ||
223 | if (*ptr_) | |
224 | { | |
225 | end_state_ = true; | |
226 | id_ = *(ptr_ + id_index); | |
227 | uid_ = *(ptr_ + unique_id_index); | |
228 | end_start_state_ = *(ptr_ + state_index); | |
229 | end_bol_ = bol_; | |
230 | end_token_ = curr_; | |
231 | } | |
232 | } | |
233 | ||
234 | if (end_state_) | |
235 | { | |
236 | // return longest match | |
237 | start_state_ = end_start_state_; | |
238 | start_token_ = end_token_; | |
239 | ||
240 | if (id_ == 0) | |
241 | { | |
242 | bol_ = end_bol_; | |
243 | goto again; | |
244 | } | |
245 | else | |
246 | { | |
247 | _data.bol = end_bol_; | |
248 | } | |
249 | } | |
250 | else | |
251 | { | |
252 | // No match causes char to be skipped | |
253 | _data.bol = *start_token_ == '\n'; | |
254 | ++start_token_; | |
255 | id_ = npos; | |
256 | uid_ = npos; | |
257 | } | |
258 | ||
259 | unique_id_ = uid_; | |
260 | return id_; | |
261 | } | |
262 | ||
263 | std::size_t next (const detail::internals &internals_, | |
264 | std::size_t &start_state_, FwdIter &start_token_, | |
265 | FwdIter const &end_, std::size_t &unique_id_) | |
266 | { | |
267 | if (start_token_ == end_) | |
268 | { | |
269 | unique_id_ = npos; | |
270 | return 0; | |
271 | } | |
272 | ||
273 | again: | |
274 | const std::size_t * lookup_ = &internals_._lookup[start_state_]-> | |
275 | front (); | |
276 | std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_]; | |
277 | const std::size_t *dfa_ = &internals_._dfa[start_state_]->front (); | |
278 | const std::size_t *ptr_ = dfa_ + dfa_alphabet_; | |
279 | FwdIter curr_ = start_token_; | |
280 | bool end_state_ = *ptr_ != 0; | |
281 | std::size_t id_ = *(ptr_ + id_index); | |
282 | std::size_t uid_ = *(ptr_ + unique_id_index); | |
283 | std::size_t end_start_state_ = start_state_; | |
284 | FwdIter end_token_ = start_token_; | |
285 | ||
286 | while (curr_ != end_) | |
287 | { | |
288 | const std::size_t state_ = ptr_[lookup_[static_cast | |
289 | <typename Traits::index_type>(*curr_++)]]; | |
290 | ||
291 | if (state_ == 0) | |
292 | { | |
293 | break; | |
294 | } | |
295 | ||
296 | ptr_ = &dfa_[state_ * dfa_alphabet_]; | |
297 | ||
298 | if (*ptr_) | |
299 | { | |
300 | end_state_ = true; | |
301 | id_ = *(ptr_ + id_index); | |
302 | uid_ = *(ptr_ + unique_id_index); | |
303 | end_start_state_ = *(ptr_ + state_index); | |
304 | end_token_ = curr_; | |
305 | } | |
306 | } | |
307 | ||
308 | if (end_state_) | |
309 | { | |
310 | // return longest match | |
311 | start_state_ = end_start_state_; | |
312 | start_token_ = end_token_; | |
313 | ||
314 | if (id_ == 0) goto again; | |
315 | } | |
316 | else | |
317 | { | |
318 | // No match causes char to be skipped | |
319 | ++start_token_; | |
320 | id_ = npos; | |
321 | uid_ = npos; | |
322 | } | |
323 | ||
324 | unique_id_ = uid_; | |
325 | return id_; | |
326 | } | |
327 | ||
328 | std::size_t next (const std::size_t * const lookup_, | |
329 | const std::size_t dfa_alphabet_, const std::size_t * const dfa_, | |
330 | bool bol_, FwdIter &start_token_, FwdIter const &end_, | |
331 | std::size_t &unique_id_) | |
332 | { | |
333 | if (start_token_ == end_) | |
334 | { | |
335 | unique_id_ = npos; | |
336 | return 0; | |
337 | } | |
338 | ||
339 | const std::size_t *ptr_ = dfa_ + dfa_alphabet_; | |
340 | FwdIter curr_ = start_token_; | |
341 | bool end_state_ = *ptr_ != 0; | |
342 | std::size_t id_ = *(ptr_ + id_index); | |
343 | std::size_t uid_ = *(ptr_ + unique_id_index); | |
344 | bool end_bol_ = bol_; | |
345 | FwdIter end_token_ = start_token_; | |
346 | ||
347 | while (curr_ != end_) | |
348 | { | |
349 | const std::size_t BOL_state_ = ptr_[bol_index]; | |
350 | const std::size_t EOL_state_ = ptr_[eol_index]; | |
351 | ||
352 | if (BOL_state_ && bol_) | |
353 | { | |
354 | ptr_ = &dfa_[BOL_state_ * dfa_alphabet_]; | |
355 | } | |
356 | else if (EOL_state_ && *curr_ == '\n') | |
357 | { | |
358 | ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; | |
359 | } | |
360 | else | |
361 | { | |
362 | typename Traits::char_type prev_char_ = *curr_++; | |
363 | ||
364 | bol_ = prev_char_ == '\n'; | |
365 | ||
366 | const std::size_t state_ = | |
367 | ptr_[lookup_[static_cast<typename Traits::index_type> | |
368 | (prev_char_)]]; | |
369 | ||
370 | if (state_ == 0) | |
371 | { | |
372 | break; | |
373 | } | |
374 | ||
375 | ptr_ = &dfa_[state_ * dfa_alphabet_]; | |
376 | } | |
377 | ||
378 | if (*ptr_) | |
379 | { | |
380 | end_state_ = true; | |
381 | id_ = *(ptr_ + id_index); | |
382 | uid_ = *(ptr_ + unique_id_index); | |
383 | end_bol_ = bol_; | |
384 | end_token_ = curr_; | |
385 | } | |
386 | } | |
387 | ||
388 | const std::size_t EOL_state_ = ptr_[eol_index]; | |
389 | ||
390 | if (EOL_state_ && curr_ == end_) | |
391 | { | |
392 | ptr_ = &dfa_[EOL_state_ * dfa_alphabet_]; | |
393 | ||
394 | if (*ptr_) | |
395 | { | |
396 | end_state_ = true; | |
397 | id_ = *(ptr_ + id_index); | |
398 | uid_ = *(ptr_ + unique_id_index); | |
399 | end_bol_ = bol_; | |
400 | end_token_ = curr_; | |
401 | } | |
402 | } | |
403 | ||
404 | if (end_state_) | |
405 | { | |
406 | // return longest match | |
407 | _data.bol = end_bol_; | |
408 | start_token_ = end_token_; | |
409 | } | |
410 | else | |
411 | { | |
412 | // No match causes char to be skipped | |
413 | _data.bol = *start_token_ == '\n'; | |
414 | ++start_token_; | |
415 | id_ = npos; | |
416 | uid_ = npos; | |
417 | } | |
418 | ||
419 | unique_id_ = uid_; | |
420 | return id_; | |
421 | } | |
422 | ||
423 | std::size_t next (const std::size_t * const lookup_, | |
424 | const std::size_t dfa_alphabet_, const std::size_t * const dfa_, | |
425 | FwdIter &start_token_, FwdIter const &end_, | |
426 | std::size_t &unique_id_) | |
427 | { | |
428 | if (start_token_ == end_) | |
429 | { | |
430 | unique_id_ = npos; | |
431 | return 0; | |
432 | } | |
433 | ||
434 | const std::size_t *ptr_ = dfa_ + dfa_alphabet_; | |
435 | FwdIter curr_ = start_token_; | |
436 | bool end_state_ = *ptr_ != 0; | |
437 | std::size_t id_ = *(ptr_ + id_index); | |
438 | std::size_t uid_ = *(ptr_ + unique_id_index); | |
439 | FwdIter end_token_ = start_token_; | |
440 | ||
441 | while (curr_ != end_) | |
442 | { | |
443 | const std::size_t state_ = ptr_[lookup_[static_cast | |
444 | <typename Traits::index_type>(*curr_++)]]; | |
445 | ||
446 | if (state_ == 0) | |
447 | { | |
448 | break; | |
449 | } | |
450 | ||
451 | ptr_ = &dfa_[state_ * dfa_alphabet_]; | |
452 | ||
453 | if (*ptr_) | |
454 | { | |
455 | end_state_ = true; | |
456 | id_ = *(ptr_ + id_index); | |
457 | uid_ = *(ptr_ + unique_id_index); | |
458 | end_token_ = curr_; | |
459 | } | |
460 | } | |
461 | ||
462 | if (end_state_) | |
463 | { | |
464 | // return longest match | |
465 | start_token_ = end_token_; | |
466 | } | |
467 | else | |
468 | { | |
469 | // No match causes char to be skipped | |
470 | ++start_token_; | |
471 | id_ = npos; | |
472 | uid_ = npos; | |
473 | } | |
474 | ||
475 | unique_id_ = uid_; | |
476 | return id_; | |
477 | } | |
478 | }; | |
479 | ||
480 | friend class iterator; | |
481 | ||
482 | // Make it explict that we are NOT taking a copy of state_machine_! | |
483 | basic_input (const basic_state_machine<typename Traits::char_type> | |
484 | *state_machine_, const FwdIter &begin_, const FwdIter &end_) : | |
485 | _state_machine (state_machine_), | |
486 | _begin (begin_), | |
487 | _end (end_) | |
488 | { | |
489 | } | |
490 | ||
491 | iterator begin () const | |
492 | { | |
493 | iterator iter_; | |
494 | ||
495 | iter_._input = this; | |
496 | // Over-ride default of 0 (EOI) | |
497 | iter_._data.id = npos; | |
498 | iter_._data.start = _begin; | |
499 | iter_._data.end = _begin; | |
500 | iter_._data.bol = _state_machine->data ()._seen_BOL_assertion; | |
501 | iter_._data.state = 0; | |
502 | ++iter_; | |
503 | return iter_; | |
504 | } | |
505 | ||
506 | iterator end () const | |
507 | { | |
508 | iterator iter_; | |
509 | ||
510 | iter_._input = this; | |
511 | iter_._data.start = _end; | |
512 | iter_._data.end = _end; | |
513 | return iter_; | |
514 | } | |
515 | ||
516 | private: | |
517 | const basic_state_machine<typename Traits::char_type> *_state_machine; | |
518 | FwdIter _begin; | |
519 | FwdIter _end; | |
520 | }; | |
521 | ||
522 | typedef basic_input<std::string::iterator> iter_input; | |
523 | typedef basic_input<std::basic_string<wchar_t>::iterator> iter_winput; | |
524 | typedef basic_input<const char *> ptr_input; | |
525 | typedef basic_input<const wchar_t *> ptr_winput; | |
526 | } | |
527 | } | |
528 | ||
529 | #endif |