]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // generator.hpp |
2 | // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
f67539c2 TL |
6 | #ifndef BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_GENERATOR_HPP |
7 | #define BOOST_SPIRIT_SUPPORT_DETAIL_LEXER_GENERATOR_HPP | |
7c673cae FG |
8 | |
9 | #include "char_traits.hpp" | |
10 | // memcmp() | |
11 | #include <cstring> | |
12 | #include "partition/charset.hpp" | |
13 | #include "partition/equivset.hpp" | |
14 | #include <memory> | |
15 | #include <limits> | |
16 | #include "parser/tree/node.hpp" | |
17 | #include "parser/parser.hpp" | |
18 | #include "containers/ptr_list.hpp" | |
b32b8144 | 19 | #include <boost/move/unique_ptr.hpp> |
7c673cae FG |
20 | #include "rules.hpp" |
21 | #include "state_machine.hpp" | |
22 | ||
23 | namespace boost | |
24 | { | |
25 | namespace lexer | |
26 | { | |
27 | template<typename CharT, typename Traits = char_traits<CharT> > | |
28 | class basic_generator | |
29 | { | |
30 | public: | |
31 | typedef typename detail::internals::size_t_vector size_t_vector; | |
32 | typedef basic_rules<CharT> rules; | |
33 | ||
34 | static void build (const rules &rules_, | |
35 | basic_state_machine<CharT> &state_machine_) | |
36 | { | |
37 | std::size_t index_ = 0; | |
38 | std::size_t size_ = rules_.statemap ().size (); | |
39 | node_ptr_vector node_ptr_vector_; | |
40 | detail::internals &internals_ = const_cast<detail::internals &> | |
41 | (state_machine_.data ()); | |
42 | bool seen_BOL_assertion_ = false; | |
43 | bool seen_EOL_assertion_ = false; | |
44 | ||
45 | state_machine_.clear (); | |
46 | ||
47 | for (; index_ < size_; ++index_) | |
48 | { | |
49 | internals_._lookup->push_back (static_cast<size_t_vector *>(0)); | |
50 | internals_._lookup->back () = new size_t_vector; | |
51 | internals_._dfa_alphabet.push_back (0); | |
52 | internals_._dfa->push_back (static_cast<size_t_vector *>(0)); | |
53 | internals_._dfa->back () = new size_t_vector; | |
54 | } | |
55 | ||
56 | for (index_ = 0, size_ = internals_._lookup->size (); | |
57 | index_ < size_; ++index_) | |
58 | { | |
59 | internals_._lookup[index_]->resize (sizeof (CharT) == 1 ? | |
60 | num_chars : num_wchar_ts, dead_state_index); | |
61 | ||
62 | if (!rules_.regexes ()[index_].empty ()) | |
63 | { | |
64 | // vector mapping token indexes to partitioned token index sets | |
65 | index_set_vector set_mapping_; | |
66 | // syntax tree | |
67 | detail::node *root_ = build_tree (rules_, index_, | |
68 | node_ptr_vector_, internals_, set_mapping_); | |
69 | ||
70 | build_dfa (root_, set_mapping_, | |
71 | internals_._dfa_alphabet[index_], | |
72 | *internals_._dfa[index_]); | |
73 | ||
74 | if (internals_._seen_BOL_assertion) | |
75 | { | |
76 | seen_BOL_assertion_ = true; | |
77 | } | |
78 | ||
79 | if (internals_._seen_EOL_assertion) | |
80 | { | |
81 | seen_EOL_assertion_ = true; | |
82 | } | |
83 | ||
84 | internals_._seen_BOL_assertion = false; | |
85 | internals_._seen_EOL_assertion = false; | |
86 | } | |
87 | } | |
88 | ||
89 | internals_._seen_BOL_assertion = seen_BOL_assertion_; | |
90 | internals_._seen_EOL_assertion = seen_EOL_assertion_; | |
91 | } | |
92 | ||
93 | static void minimise (basic_state_machine<CharT> &state_machine_) | |
94 | { | |
95 | detail::internals &internals_ = const_cast<detail::internals &> | |
96 | (state_machine_.data ()); | |
97 | const std::size_t machines_ = internals_._dfa->size (); | |
98 | ||
99 | for (std::size_t i_ = 0; i_ < machines_; ++i_) | |
100 | { | |
101 | const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[i_]; | |
102 | size_t_vector *dfa_ = internals_._dfa[i_]; | |
103 | ||
104 | if (dfa_alphabet_ != 0) | |
105 | { | |
106 | std::size_t size_ = 0; | |
107 | ||
108 | do | |
109 | { | |
110 | size_ = dfa_->size (); | |
111 | minimise_dfa (dfa_alphabet_, *dfa_, size_); | |
112 | } while (dfa_->size () != size_); | |
113 | } | |
114 | } | |
115 | } | |
116 | ||
117 | protected: | |
118 | typedef detail::basic_charset<CharT> charset; | |
119 | typedef detail::ptr_list<charset> charset_list; | |
b32b8144 | 120 | typedef boost::movelib::unique_ptr<charset> charset_ptr; |
7c673cae FG |
121 | typedef detail::equivset equivset; |
122 | typedef detail::ptr_list<equivset> equivset_list; | |
b32b8144 | 123 | typedef boost::movelib::unique_ptr<equivset> equivset_ptr; |
7c673cae FG |
124 | typedef typename charset::index_set index_set; |
125 | typedef std::vector<index_set> index_set_vector; | |
126 | typedef detail::basic_parser<CharT> parser; | |
127 | typedef typename parser::node_ptr_vector node_ptr_vector; | |
128 | typedef std::set<const detail::node *> node_set; | |
129 | typedef detail::ptr_vector<node_set> node_set_vector; | |
130 | typedef std::vector<const detail::node *> node_vector; | |
131 | typedef detail::ptr_vector<node_vector> node_vector_vector; | |
132 | typedef typename parser::string string; | |
133 | typedef std::pair<string, string> string_pair; | |
134 | typedef typename parser::tokeniser::string_token string_token; | |
135 | typedef std::deque<string_pair> macro_deque; | |
136 | typedef std::pair<string, const detail::node *> macro_pair; | |
137 | typedef typename parser::macro_map::iterator macro_iter; | |
138 | typedef std::pair<macro_iter, bool> macro_iter_pair; | |
139 | typedef typename parser::tokeniser::token_map token_map; | |
140 | ||
141 | static detail::node *build_tree (const rules &rules_, | |
142 | const std::size_t state_, node_ptr_vector &node_ptr_vector_, | |
143 | detail::internals &internals_, index_set_vector &set_mapping_) | |
144 | { | |
145 | size_t_vector *lookup_ = internals_._lookup[state_]; | |
146 | const typename rules::string_deque_deque ®exes_ = | |
147 | rules_.regexes (); | |
148 | const typename rules::id_vector_deque &ids_ = rules_.ids (); | |
149 | const typename rules::id_vector_deque &unique_ids_ = | |
150 | rules_.unique_ids (); | |
151 | const typename rules::id_vector_deque &states_ = rules_.states (); | |
152 | typename rules::string_deque::const_iterator regex_iter_ = | |
153 | regexes_[state_].begin (); | |
154 | typename rules::string_deque::const_iterator regex_iter_end_ = | |
155 | regexes_[state_].end (); | |
156 | typename rules::id_vector::const_iterator ids_iter_ = | |
157 | ids_[state_].begin (); | |
158 | typename rules::id_vector::const_iterator unique_ids_iter_ = | |
159 | unique_ids_[state_].begin (); | |
160 | typename rules::id_vector::const_iterator states_iter_ = | |
161 | states_[state_].begin (); | |
162 | const typename rules::string ®ex_ = *regex_iter_; | |
163 | // map of regex charset tokens (strings) to index | |
164 | token_map token_map_; | |
165 | const typename rules::string_pair_deque ¯odeque_ = | |
166 | rules_.macrodeque (); | |
167 | typename parser::macro_map macromap_; | |
168 | typename detail::node::node_vector tree_vector_; | |
169 | ||
170 | build_macros (token_map_, macrodeque_, macromap_, | |
171 | rules_.flags (), rules_.locale (), node_ptr_vector_, | |
172 | internals_._seen_BOL_assertion, internals_._seen_EOL_assertion); | |
173 | ||
174 | detail::node *root_ = parser::parse (regex_.c_str (), | |
175 | regex_.c_str () + regex_.size (), *ids_iter_, *unique_ids_iter_, | |
176 | *states_iter_, rules_.flags (), rules_.locale (), node_ptr_vector_, | |
177 | macromap_, token_map_, internals_._seen_BOL_assertion, | |
178 | internals_._seen_EOL_assertion); | |
179 | ||
180 | ++regex_iter_; | |
181 | ++ids_iter_; | |
182 | ++unique_ids_iter_; | |
183 | ++states_iter_; | |
184 | tree_vector_.push_back (root_); | |
185 | ||
186 | // build syntax trees | |
187 | while (regex_iter_ != regex_iter_end_) | |
188 | { | |
189 | // re-declare var, otherwise we perform an assignment..! | |
190 | const typename rules::string ®ex2_ = *regex_iter_; | |
191 | ||
192 | root_ = parser::parse (regex2_.c_str (), | |
193 | regex2_.c_str () + regex2_.size (), *ids_iter_, | |
194 | *unique_ids_iter_, *states_iter_, rules_.flags (), | |
195 | rules_.locale (), node_ptr_vector_, macromap_, token_map_, | |
196 | internals_._seen_BOL_assertion, | |
197 | internals_._seen_EOL_assertion); | |
198 | tree_vector_.push_back (root_); | |
199 | ++regex_iter_; | |
200 | ++ids_iter_; | |
201 | ++unique_ids_iter_; | |
202 | ++states_iter_; | |
203 | } | |
204 | ||
205 | if (internals_._seen_BOL_assertion) | |
206 | { | |
207 | // Fixup BOLs | |
208 | typename detail::node::node_vector::iterator iter_ = | |
209 | tree_vector_.begin (); | |
210 | typename detail::node::node_vector::iterator end_ = | |
211 | tree_vector_.end (); | |
212 | ||
213 | for (; iter_ != end_; ++iter_) | |
214 | { | |
215 | fixup_bol (*iter_, node_ptr_vector_); | |
216 | } | |
217 | } | |
218 | ||
219 | // join trees | |
220 | { | |
221 | typename detail::node::node_vector::iterator iter_ = | |
222 | tree_vector_.begin (); | |
223 | typename detail::node::node_vector::iterator end_ = | |
224 | tree_vector_.end (); | |
225 | ||
226 | if (iter_ != end_) | |
227 | { | |
228 | root_ = *iter_; | |
229 | ++iter_; | |
230 | } | |
231 | ||
232 | for (; iter_ != end_; ++iter_) | |
233 | { | |
234 | node_ptr_vector_->push_back (static_cast<detail::selection_node *>(0)); | |
235 | node_ptr_vector_->back () = new detail::selection_node | |
236 | (root_, *iter_); | |
237 | root_ = node_ptr_vector_->back (); | |
238 | } | |
239 | } | |
240 | ||
241 | // partitioned token list | |
242 | charset_list token_list_; | |
243 | ||
244 | set_mapping_.resize (token_map_.size ()); | |
245 | partition_tokens (token_map_, token_list_); | |
246 | ||
247 | typename charset_list::list::const_iterator iter_ = | |
248 | token_list_->begin (); | |
249 | typename charset_list::list::const_iterator end_ = | |
250 | token_list_->end (); | |
251 | std::size_t index_ = 0; | |
252 | ||
253 | for (; iter_ != end_; ++iter_, ++index_) | |
254 | { | |
255 | const charset *cs_ = *iter_; | |
256 | typename charset::index_set::const_iterator set_iter_ = | |
257 | cs_->_index_set.begin (); | |
258 | typename charset::index_set::const_iterator set_end_ = | |
259 | cs_->_index_set.end (); | |
260 | ||
261 | fill_lookup (cs_->_token, lookup_, index_); | |
262 | ||
263 | for (; set_iter_ != set_end_; ++set_iter_) | |
264 | { | |
265 | set_mapping_[*set_iter_].insert (index_); | |
266 | } | |
267 | } | |
268 | ||
269 | internals_._dfa_alphabet[state_] = token_list_->size () + dfa_offset; | |
270 | return root_; | |
271 | } | |
272 | ||
273 | static void build_macros (token_map &token_map_, | |
274 | const macro_deque ¯odeque_, | |
275 | typename parser::macro_map ¯omap_, const regex_flags flags_, | |
276 | const std::locale &locale_, node_ptr_vector &node_ptr_vector_, | |
277 | bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) | |
278 | { | |
279 | for (typename macro_deque::const_iterator iter_ = | |
280 | macrodeque_.begin (), end_ = macrodeque_.end (); | |
281 | iter_ != end_; ++iter_) | |
282 | { | |
283 | const typename rules::string &name_ = iter_->first; | |
284 | const typename rules::string ®ex_ = iter_->second; | |
285 | detail::node *node_ = parser::parse (regex_.c_str (), | |
286 | regex_.c_str () + regex_.size (), 0, 0, 0, flags_, | |
287 | locale_, node_ptr_vector_, macromap_, token_map_, | |
288 | seen_BOL_assertion_, seen_EOL_assertion_); | |
289 | macro_iter_pair map_iter_ = macromap_. | |
290 | insert (macro_pair (name_, static_cast<const detail::node *> | |
291 | (0))); | |
292 | ||
293 | map_iter_.first->second = node_; | |
294 | } | |
295 | } | |
296 | ||
297 | static void build_dfa (detail::node *root_, | |
298 | const index_set_vector &set_mapping_, const std::size_t dfa_alphabet_, | |
299 | size_t_vector &dfa_) | |
300 | { | |
301 | typename detail::node::node_vector *followpos_ = | |
302 | &root_->firstpos (); | |
303 | node_set_vector seen_sets_; | |
304 | node_vector_vector seen_vectors_; | |
305 | size_t_vector hash_vector_; | |
306 | ||
307 | // 'jam' state | |
308 | dfa_.resize (dfa_alphabet_, 0); | |
309 | closure (followpos_, seen_sets_, seen_vectors_, | |
310 | hash_vector_, dfa_alphabet_, dfa_); | |
311 | ||
312 | std::size_t *ptr_ = 0; | |
313 | ||
314 | for (std::size_t index_ = 0; index_ < seen_vectors_->size (); ++index_) | |
315 | { | |
316 | equivset_list equiv_list_; | |
317 | ||
318 | build_equiv_list (seen_vectors_[index_], set_mapping_, equiv_list_); | |
319 | ||
320 | for (typename equivset_list::list::const_iterator iter_ = | |
321 | equiv_list_->begin (), end_ = equiv_list_->end (); | |
322 | iter_ != end_; ++iter_) | |
323 | { | |
324 | equivset *equivset_ = *iter_; | |
325 | const std::size_t transition_ = closure | |
326 | (&equivset_->_followpos, seen_sets_, seen_vectors_, | |
327 | hash_vector_, dfa_alphabet_, dfa_); | |
328 | ||
329 | if (transition_ != npos) | |
330 | { | |
331 | ptr_ = &dfa_.front () + ((index_ + 1) * dfa_alphabet_); | |
332 | ||
333 | // Prune abstemious transitions from end states. | |
334 | if (*ptr_ && !equivset_->_greedy) continue; | |
335 | ||
336 | for (typename detail::equivset::index_vector::const_iterator | |
337 | equiv_iter_ = equivset_->_index_vector.begin (), | |
338 | equiv_end_ = equivset_->_index_vector.end (); | |
339 | equiv_iter_ != equiv_end_; ++equiv_iter_) | |
340 | { | |
341 | const std::size_t equiv_index_ = *equiv_iter_; | |
342 | ||
343 | if (equiv_index_ == bol_token) | |
344 | { | |
345 | if (ptr_[eol_index] == 0) | |
346 | { | |
347 | ptr_[bol_index] = transition_; | |
348 | } | |
349 | } | |
350 | else if (equiv_index_ == eol_token) | |
351 | { | |
352 | if (ptr_[bol_index] == 0) | |
353 | { | |
354 | ptr_[eol_index] = transition_; | |
355 | } | |
356 | } | |
357 | else | |
358 | { | |
359 | ptr_[equiv_index_ + dfa_offset] = transition_; | |
360 | } | |
361 | } | |
362 | } | |
363 | } | |
364 | } | |
365 | } | |
366 | ||
367 | static std::size_t closure (typename detail::node::node_vector *followpos_, | |
368 | node_set_vector &seen_sets_, node_vector_vector &seen_vectors_, | |
369 | size_t_vector &hash_vector_, const std::size_t size_, | |
370 | size_t_vector &dfa_) | |
371 | { | |
372 | bool end_state_ = false; | |
373 | std::size_t id_ = 0; | |
374 | std::size_t unique_id_ = npos; | |
375 | std::size_t state_ = 0; | |
376 | std::size_t hash_ = 0; | |
377 | ||
378 | if (followpos_->empty ()) return npos; | |
379 | ||
380 | std::size_t index_ = 0; | |
b32b8144 FG |
381 | boost::movelib::unique_ptr<node_set> set_ptr_ (new node_set); |
382 | boost::movelib::unique_ptr<node_vector> vector_ptr_ (new node_vector); | |
7c673cae FG |
383 | |
384 | for (typename detail::node::node_vector::const_iterator iter_ = | |
385 | followpos_->begin (), end_ = followpos_->end (); | |
386 | iter_ != end_; ++iter_) | |
387 | { | |
388 | closure_ex (*iter_, end_state_, id_, unique_id_, state_, | |
389 | set_ptr_.get (), vector_ptr_.get (), hash_); | |
390 | } | |
391 | ||
392 | bool found_ = false; | |
393 | typename size_t_vector::const_iterator hash_iter_ = | |
394 | hash_vector_.begin (); | |
395 | typename size_t_vector::const_iterator hash_end_ = | |
396 | hash_vector_.end (); | |
397 | typename node_set_vector::vector::const_iterator set_iter_ = | |
398 | seen_sets_->begin (); | |
399 | ||
400 | for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_) | |
401 | { | |
402 | found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_; | |
403 | ++index_; | |
404 | ||
405 | if (found_) break; | |
406 | } | |
407 | ||
408 | if (!found_) | |
409 | { | |
410 | seen_sets_->push_back (static_cast<node_set *>(0)); | |
411 | seen_sets_->back () = set_ptr_.release (); | |
412 | seen_vectors_->push_back (static_cast<node_vector *>(0)); | |
413 | seen_vectors_->back () = vector_ptr_.release (); | |
414 | hash_vector_.push_back (hash_); | |
415 | // State 0 is the jam state... | |
416 | index_ = seen_sets_->size (); | |
417 | ||
418 | const std::size_t old_size_ = dfa_.size (); | |
419 | ||
420 | dfa_.resize (old_size_ + size_, 0); | |
421 | ||
422 | if (end_state_) | |
423 | { | |
424 | dfa_[old_size_] |= end_state; | |
425 | dfa_[old_size_ + id_index] = id_; | |
426 | dfa_[old_size_ + unique_id_index] = unique_id_; | |
427 | dfa_[old_size_ + state_index] = state_; | |
428 | } | |
429 | } | |
430 | ||
431 | return index_; | |
432 | } | |
433 | ||
434 | static void closure_ex (detail::node *node_, bool &end_state_, | |
435 | std::size_t &id_, std::size_t &unique_id_, std::size_t &state_, | |
436 | node_set *set_ptr_, node_vector *vector_ptr_, std::size_t &hash_) | |
437 | { | |
438 | const bool temp_end_state_ = node_->end_state (); | |
439 | ||
440 | if (temp_end_state_) | |
441 | { | |
442 | if (!end_state_) | |
443 | { | |
444 | end_state_ = true; | |
445 | id_ = node_->id (); | |
446 | unique_id_ = node_->unique_id (); | |
447 | state_ = node_->lexer_state (); | |
448 | } | |
449 | } | |
450 | ||
451 | if (set_ptr_->insert (node_).second) | |
452 | { | |
453 | vector_ptr_->push_back (node_); | |
454 | hash_ += reinterpret_cast<std::size_t> (node_); | |
455 | } | |
456 | } | |
457 | ||
458 | static void partition_tokens (const token_map &map_, | |
459 | charset_list &lhs_) | |
460 | { | |
461 | charset_list rhs_; | |
462 | ||
463 | fill_rhs_list (map_, rhs_); | |
464 | ||
465 | if (!rhs_->empty ()) | |
466 | { | |
467 | typename charset_list::list::iterator iter_; | |
468 | typename charset_list::list::iterator end_; | |
469 | charset_ptr overlap_ (new charset); | |
470 | ||
471 | lhs_->push_back (static_cast<charset *>(0)); | |
472 | lhs_->back () = rhs_->front (); | |
473 | rhs_->pop_front (); | |
474 | ||
475 | while (!rhs_->empty ()) | |
476 | { | |
477 | charset_ptr r_ (rhs_->front ()); | |
478 | ||
479 | rhs_->pop_front (); | |
480 | iter_ = lhs_->begin (); | |
481 | end_ = lhs_->end (); | |
482 | ||
483 | while (!r_->empty () && iter_ != end_) | |
484 | { | |
485 | typename charset_list::list::iterator l_iter_ = iter_; | |
486 | ||
487 | (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); | |
488 | ||
489 | if (overlap_->empty ()) | |
490 | { | |
491 | ++iter_; | |
492 | } | |
493 | else if ((*l_iter_)->empty ()) | |
494 | { | |
495 | delete *l_iter_; | |
496 | *l_iter_ = overlap_.release (); | |
497 | ||
b32b8144 | 498 | overlap_.reset (new charset); |
7c673cae FG |
499 | ++iter_; |
500 | } | |
501 | else if (r_->empty ()) | |
502 | { | |
b32b8144 FG |
503 | overlap_.swap (r_); |
504 | overlap_.reset (new charset); | |
7c673cae FG |
505 | break; |
506 | } | |
507 | else | |
508 | { | |
509 | iter_ = lhs_->insert (++iter_, | |
510 | static_cast<charset *>(0)); | |
511 | *iter_ = overlap_.release (); | |
512 | ||
b32b8144 | 513 | overlap_.reset(new charset); |
7c673cae FG |
514 | ++iter_; |
515 | end_ = lhs_->end (); | |
516 | } | |
517 | } | |
518 | ||
519 | if (!r_->empty ()) | |
520 | { | |
521 | lhs_->push_back (static_cast<charset *>(0)); | |
522 | lhs_->back () = r_.release (); | |
523 | } | |
524 | } | |
525 | } | |
526 | } | |
527 | ||
528 | static void fill_rhs_list (const token_map &map_, | |
529 | charset_list &list_) | |
530 | { | |
531 | typename parser::tokeniser::token_map::const_iterator iter_ = | |
532 | map_.begin (); | |
533 | typename parser::tokeniser::token_map::const_iterator end_ = | |
534 | map_.end (); | |
535 | ||
536 | for (; iter_ != end_; ++iter_) | |
537 | { | |
538 | list_->push_back (static_cast<charset *>(0)); | |
539 | list_->back () = new charset (iter_->first, iter_->second); | |
540 | } | |
541 | } | |
542 | ||
543 | static void fill_lookup (const string_token &token_, | |
544 | size_t_vector *lookup_, const std::size_t index_) | |
545 | { | |
546 | const CharT *curr_ = token_._charset.c_str (); | |
547 | const CharT *chars_end_ = curr_ + token_._charset.size (); | |
548 | std::size_t *ptr_ = &lookup_->front (); | |
549 | const std::size_t max_ = sizeof (CharT) == 1 ? | |
550 | num_chars : num_wchar_ts; | |
551 | ||
552 | if (token_._negated) | |
553 | { | |
554 | // $$$ FIXME JDG July 2014 $$$ | |
555 | // this code is problematic on platforms where wchar_t is signed | |
556 | // with min generating negative numbers. This crashes with BAD_ACCESS | |
557 | // because of the vector index below: | |
558 | // ptr_[static_cast<typename Traits::index_type>(curr_char_)] | |
559 | CharT curr_char_ = 0; // (std::numeric_limits<CharT>::min)(); | |
560 | std::size_t i_ = 0; | |
561 | ||
562 | while (curr_ < chars_end_) | |
563 | { | |
564 | while (*curr_ > curr_char_) | |
565 | { | |
566 | ptr_[static_cast<typename Traits::index_type> | |
567 | (curr_char_)] = index_ + dfa_offset; | |
568 | ++curr_char_; | |
569 | ++i_; | |
570 | } | |
571 | ||
572 | ++curr_char_; | |
573 | ++curr_; | |
574 | ++i_; | |
575 | } | |
576 | ||
577 | for (; i_ < max_; ++i_) | |
578 | { | |
579 | ptr_[static_cast<typename Traits::index_type>(curr_char_)] = | |
580 | index_ + dfa_offset; | |
581 | ++curr_char_; | |
582 | } | |
583 | } | |
584 | else | |
585 | { | |
586 | while (curr_ < chars_end_) | |
587 | { | |
588 | ptr_[static_cast<typename Traits::index_type>(*curr_)] = | |
589 | index_ + dfa_offset; | |
590 | ++curr_; | |
591 | } | |
592 | } | |
593 | } | |
594 | ||
595 | static void build_equiv_list (const node_vector *vector_, | |
596 | const index_set_vector &set_mapping_, equivset_list &lhs_) | |
597 | { | |
598 | equivset_list rhs_; | |
599 | ||
600 | fill_rhs_list (vector_, set_mapping_, rhs_); | |
601 | ||
602 | if (!rhs_->empty ()) | |
603 | { | |
604 | typename equivset_list::list::iterator iter_; | |
605 | typename equivset_list::list::iterator end_; | |
606 | equivset_ptr overlap_ (new equivset); | |
607 | ||
608 | lhs_->push_back (static_cast<equivset *>(0)); | |
609 | lhs_->back () = rhs_->front (); | |
610 | rhs_->pop_front (); | |
611 | ||
612 | while (!rhs_->empty ()) | |
613 | { | |
614 | equivset_ptr r_ (rhs_->front ()); | |
615 | ||
616 | rhs_->pop_front (); | |
617 | iter_ = lhs_->begin (); | |
618 | end_ = lhs_->end (); | |
619 | ||
620 | while (!r_->empty () && iter_ != end_) | |
621 | { | |
622 | typename equivset_list::list::iterator l_iter_ = iter_; | |
623 | ||
624 | (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); | |
625 | ||
626 | if (overlap_->empty ()) | |
627 | { | |
628 | ++iter_; | |
629 | } | |
630 | else if ((*l_iter_)->empty ()) | |
631 | { | |
632 | delete *l_iter_; | |
633 | *l_iter_ = overlap_.release (); | |
634 | ||
b32b8144 | 635 | overlap_.reset (new equivset); |
7c673cae FG |
636 | ++iter_; |
637 | } | |
638 | else if (r_->empty ()) | |
639 | { | |
b32b8144 FG |
640 | overlap_.swap (r_); |
641 | overlap_.reset (new equivset); | |
7c673cae FG |
642 | break; |
643 | } | |
644 | else | |
645 | { | |
646 | iter_ = lhs_->insert (++iter_, | |
647 | static_cast<equivset *>(0)); | |
648 | *iter_ = overlap_.release (); | |
649 | ||
b32b8144 | 650 | overlap_.reset (new equivset); |
7c673cae FG |
651 | ++iter_; |
652 | end_ = lhs_->end (); | |
653 | } | |
654 | } | |
655 | ||
656 | if (!r_->empty ()) | |
657 | { | |
658 | lhs_->push_back (static_cast<equivset *>(0)); | |
659 | lhs_->back () = r_.release (); | |
660 | } | |
661 | } | |
662 | } | |
663 | } | |
664 | ||
665 | static void fill_rhs_list (const node_vector *vector_, | |
666 | const index_set_vector &set_mapping_, equivset_list &list_) | |
667 | { | |
668 | typename node_vector::const_iterator iter_ = | |
669 | vector_->begin (); | |
670 | typename node_vector::const_iterator end_ = | |
671 | vector_->end (); | |
672 | ||
673 | for (; iter_ != end_; ++iter_) | |
674 | { | |
675 | const detail::node *node_ = *iter_; | |
676 | ||
677 | if (!node_->end_state ()) | |
678 | { | |
679 | const std::size_t token_ = node_->token (); | |
680 | ||
681 | if (token_ != null_token) | |
682 | { | |
683 | list_->push_back (static_cast<equivset *>(0)); | |
684 | ||
685 | if (token_ == bol_token || token_ == eol_token) | |
686 | { | |
687 | std::set<std::size_t> index_set_; | |
688 | ||
689 | index_set_.insert (token_); | |
690 | list_->back () = new equivset (index_set_, | |
691 | node_->greedy (), token_, node_->followpos ()); | |
692 | } | |
693 | else | |
694 | { | |
695 | list_->back () = new equivset (set_mapping_[token_], | |
696 | node_->greedy (), token_, node_->followpos ()); | |
697 | } | |
698 | } | |
699 | } | |
700 | } | |
701 | } | |
702 | ||
703 | static void fixup_bol (detail::node * &root_, | |
704 | node_ptr_vector &node_ptr_vector_) | |
705 | { | |
706 | typename detail::node::node_vector *first_ = &root_->firstpos (); | |
707 | bool found_ = false; | |
708 | typename detail::node::node_vector::const_iterator iter_ = | |
709 | first_->begin (); | |
710 | typename detail::node::node_vector::const_iterator end_ = | |
711 | first_->end (); | |
712 | ||
713 | for (; iter_ != end_; ++iter_) | |
714 | { | |
715 | const detail::node *node_ = *iter_; | |
716 | ||
717 | found_ = !node_->end_state () && node_->token () == bol_token; | |
718 | ||
719 | if (found_) break; | |
720 | } | |
721 | ||
722 | if (!found_) | |
723 | { | |
724 | node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0)); | |
725 | node_ptr_vector_->back () = new detail::leaf_node | |
726 | (bol_token, true); | |
727 | ||
728 | detail::node *lhs_ = node_ptr_vector_->back (); | |
729 | ||
730 | node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0)); | |
731 | node_ptr_vector_->back () = new detail::leaf_node | |
732 | (null_token, true); | |
733 | ||
734 | detail::node *rhs_ = node_ptr_vector_->back (); | |
735 | ||
736 | node_ptr_vector_->push_back | |
737 | (static_cast<detail::selection_node *>(0)); | |
738 | node_ptr_vector_->back () = | |
739 | new detail::selection_node (lhs_, rhs_); | |
740 | lhs_ = node_ptr_vector_->back (); | |
741 | ||
742 | node_ptr_vector_->push_back | |
743 | (static_cast<detail::sequence_node *>(0)); | |
744 | node_ptr_vector_->back () = | |
745 | new detail::sequence_node (lhs_, root_); | |
746 | root_ = node_ptr_vector_->back (); | |
747 | } | |
748 | } | |
749 | ||
750 | static void minimise_dfa (const std::size_t dfa_alphabet_, | |
751 | size_t_vector &dfa_, std::size_t size_) | |
752 | { | |
753 | const std::size_t *first_ = &dfa_.front (); | |
754 | const std::size_t *second_ = 0; | |
755 | const std::size_t *end_ = first_ + size_; | |
756 | std::size_t index_ = 1; | |
757 | std::size_t new_index_ = 1; | |
758 | std::size_t curr_index_ = 0; | |
759 | index_set index_set_; | |
760 | size_t_vector lookup_; | |
761 | std::size_t *lookup_ptr_ = 0; | |
762 | ||
763 | lookup_.resize (size_ / dfa_alphabet_, null_token); | |
764 | lookup_ptr_ = &lookup_.front (); | |
765 | *lookup_ptr_ = 0; | |
766 | // Only one 'jam' state, so skip it. | |
767 | first_ += dfa_alphabet_; | |
768 | ||
769 | for (; first_ < end_; first_ += dfa_alphabet_, ++index_) | |
770 | { | |
771 | for (second_ = first_ + dfa_alphabet_, curr_index_ = index_ + 1; | |
772 | second_ < end_; second_ += dfa_alphabet_, ++curr_index_) | |
773 | { | |
774 | if (index_set_.find (curr_index_) != index_set_.end ()) | |
775 | { | |
776 | continue; | |
777 | } | |
778 | ||
779 | // Some systems have memcmp in namespace std. | |
780 | using namespace std; | |
781 | ||
782 | if (memcmp (first_, second_, sizeof (std::size_t) * | |
783 | dfa_alphabet_) == 0) | |
784 | { | |
785 | index_set_.insert (curr_index_); | |
786 | lookup_ptr_[curr_index_] = new_index_; | |
787 | } | |
788 | } | |
789 | ||
790 | if (lookup_ptr_[index_] == null_token) | |
791 | { | |
792 | lookup_ptr_[index_] = new_index_; | |
793 | ++new_index_; | |
794 | } | |
795 | } | |
796 | ||
797 | if (!index_set_.empty ()) | |
798 | { | |
799 | const std::size_t *front_ = &dfa_.front (); | |
800 | size_t_vector new_dfa_ (front_, front_ + dfa_alphabet_); | |
801 | typename index_set::iterator set_end_ = | |
802 | index_set_.end (); | |
803 | const std::size_t *ptr_ = front_ + dfa_alphabet_; | |
804 | std::size_t *new_ptr_ = 0; | |
805 | ||
806 | new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0); | |
807 | new_ptr_ = &new_dfa_.front () + dfa_alphabet_; | |
808 | size_ /= dfa_alphabet_; | |
809 | ||
810 | for (index_ = 1; index_ < size_; ++index_) | |
811 | { | |
812 | if (index_set_.find (index_) != set_end_) | |
813 | { | |
814 | ptr_ += dfa_alphabet_; | |
815 | continue; | |
816 | } | |
817 | ||
818 | new_ptr_[end_state_index] = ptr_[end_state_index]; | |
819 | new_ptr_[id_index] = ptr_[id_index]; | |
820 | new_ptr_[unique_id_index] = ptr_[unique_id_index]; | |
821 | new_ptr_[state_index] = ptr_[state_index]; | |
822 | new_ptr_[bol_index] = lookup_ptr_[ptr_[bol_index]]; | |
823 | new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]]; | |
824 | new_ptr_ += dfa_offset; | |
825 | ptr_ += dfa_offset; | |
826 | ||
827 | for (std::size_t i_ = dfa_offset; i_ < dfa_alphabet_; ++i_) | |
828 | { | |
829 | *new_ptr_++ = lookup_ptr_[*ptr_++]; | |
830 | } | |
831 | } | |
832 | ||
833 | dfa_.swap (new_dfa_); | |
834 | } | |
835 | } | |
836 | }; | |
837 | ||
838 | typedef basic_generator<char> generator; | |
839 | typedef basic_generator<wchar_t> wgenerator; | |
840 | } | |
841 | } | |
842 | ||
843 | #endif |