]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // generator.hpp |
2 | // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
6 | #ifndef BOOST_LEXER_GENERATOR_HPP | |
7 | #define BOOST_LEXER_GENERATOR_HPP | |
8 | ||
9 | #include "char_traits.hpp" | |
10 | // memcmp() | |
11 | #include <cstring> | |
12 | #include "partition/charset.hpp" | |
13 | #include "partition/equivset.hpp" | |
14 | #include <memory> | |
15 | #include <limits> | |
16 | #include "parser/tree/node.hpp" | |
17 | #include "parser/parser.hpp" | |
18 | #include "containers/ptr_list.hpp" | |
19 | #include "rules.hpp" | |
20 | #include "state_machine.hpp" | |
21 | ||
22 | namespace boost | |
23 | { | |
24 | namespace lexer | |
25 | { | |
26 | template<typename CharT, typename Traits = char_traits<CharT> > | |
27 | class basic_generator | |
28 | { | |
29 | public: | |
30 | typedef typename detail::internals::size_t_vector size_t_vector; | |
31 | typedef basic_rules<CharT> rules; | |
32 | ||
33 | static void build (const rules &rules_, | |
34 | basic_state_machine<CharT> &state_machine_) | |
35 | { | |
36 | std::size_t index_ = 0; | |
37 | std::size_t size_ = rules_.statemap ().size (); | |
38 | node_ptr_vector node_ptr_vector_; | |
39 | detail::internals &internals_ = const_cast<detail::internals &> | |
40 | (state_machine_.data ()); | |
41 | bool seen_BOL_assertion_ = false; | |
42 | bool seen_EOL_assertion_ = false; | |
43 | ||
44 | state_machine_.clear (); | |
45 | ||
46 | for (; index_ < size_; ++index_) | |
47 | { | |
48 | internals_._lookup->push_back (static_cast<size_t_vector *>(0)); | |
49 | internals_._lookup->back () = new size_t_vector; | |
50 | internals_._dfa_alphabet.push_back (0); | |
51 | internals_._dfa->push_back (static_cast<size_t_vector *>(0)); | |
52 | internals_._dfa->back () = new size_t_vector; | |
53 | } | |
54 | ||
55 | for (index_ = 0, size_ = internals_._lookup->size (); | |
56 | index_ < size_; ++index_) | |
57 | { | |
58 | internals_._lookup[index_]->resize (sizeof (CharT) == 1 ? | |
59 | num_chars : num_wchar_ts, dead_state_index); | |
60 | ||
61 | if (!rules_.regexes ()[index_].empty ()) | |
62 | { | |
63 | // vector mapping token indexes to partitioned token index sets | |
64 | index_set_vector set_mapping_; | |
65 | // syntax tree | |
66 | detail::node *root_ = build_tree (rules_, index_, | |
67 | node_ptr_vector_, internals_, set_mapping_); | |
68 | ||
69 | build_dfa (root_, set_mapping_, | |
70 | internals_._dfa_alphabet[index_], | |
71 | *internals_._dfa[index_]); | |
72 | ||
73 | if (internals_._seen_BOL_assertion) | |
74 | { | |
75 | seen_BOL_assertion_ = true; | |
76 | } | |
77 | ||
78 | if (internals_._seen_EOL_assertion) | |
79 | { | |
80 | seen_EOL_assertion_ = true; | |
81 | } | |
82 | ||
83 | internals_._seen_BOL_assertion = false; | |
84 | internals_._seen_EOL_assertion = false; | |
85 | } | |
86 | } | |
87 | ||
88 | internals_._seen_BOL_assertion = seen_BOL_assertion_; | |
89 | internals_._seen_EOL_assertion = seen_EOL_assertion_; | |
90 | } | |
91 | ||
92 | static void minimise (basic_state_machine<CharT> &state_machine_) | |
93 | { | |
94 | detail::internals &internals_ = const_cast<detail::internals &> | |
95 | (state_machine_.data ()); | |
96 | const std::size_t machines_ = internals_._dfa->size (); | |
97 | ||
98 | for (std::size_t i_ = 0; i_ < machines_; ++i_) | |
99 | { | |
100 | const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[i_]; | |
101 | size_t_vector *dfa_ = internals_._dfa[i_]; | |
102 | ||
103 | if (dfa_alphabet_ != 0) | |
104 | { | |
105 | std::size_t size_ = 0; | |
106 | ||
107 | do | |
108 | { | |
109 | size_ = dfa_->size (); | |
110 | minimise_dfa (dfa_alphabet_, *dfa_, size_); | |
111 | } while (dfa_->size () != size_); | |
112 | } | |
113 | } | |
114 | } | |
115 | ||
116 | protected: | |
117 | typedef detail::basic_charset<CharT> charset; | |
118 | typedef detail::ptr_list<charset> charset_list; | |
119 | typedef std::auto_ptr<charset> charset_ptr; | |
120 | typedef detail::equivset equivset; | |
121 | typedef detail::ptr_list<equivset> equivset_list; | |
122 | typedef std::auto_ptr<equivset> equivset_ptr; | |
123 | typedef typename charset::index_set index_set; | |
124 | typedef std::vector<index_set> index_set_vector; | |
125 | typedef detail::basic_parser<CharT> parser; | |
126 | typedef typename parser::node_ptr_vector node_ptr_vector; | |
127 | typedef std::set<const detail::node *> node_set; | |
128 | typedef detail::ptr_vector<node_set> node_set_vector; | |
129 | typedef std::vector<const detail::node *> node_vector; | |
130 | typedef detail::ptr_vector<node_vector> node_vector_vector; | |
131 | typedef typename parser::string string; | |
132 | typedef std::pair<string, string> string_pair; | |
133 | typedef typename parser::tokeniser::string_token string_token; | |
134 | typedef std::deque<string_pair> macro_deque; | |
135 | typedef std::pair<string, const detail::node *> macro_pair; | |
136 | typedef typename parser::macro_map::iterator macro_iter; | |
137 | typedef std::pair<macro_iter, bool> macro_iter_pair; | |
138 | typedef typename parser::tokeniser::token_map token_map; | |
139 | ||
140 | static detail::node *build_tree (const rules &rules_, | |
141 | const std::size_t state_, node_ptr_vector &node_ptr_vector_, | |
142 | detail::internals &internals_, index_set_vector &set_mapping_) | |
143 | { | |
144 | size_t_vector *lookup_ = internals_._lookup[state_]; | |
145 | const typename rules::string_deque_deque ®exes_ = | |
146 | rules_.regexes (); | |
147 | const typename rules::id_vector_deque &ids_ = rules_.ids (); | |
148 | const typename rules::id_vector_deque &unique_ids_ = | |
149 | rules_.unique_ids (); | |
150 | const typename rules::id_vector_deque &states_ = rules_.states (); | |
151 | typename rules::string_deque::const_iterator regex_iter_ = | |
152 | regexes_[state_].begin (); | |
153 | typename rules::string_deque::const_iterator regex_iter_end_ = | |
154 | regexes_[state_].end (); | |
155 | typename rules::id_vector::const_iterator ids_iter_ = | |
156 | ids_[state_].begin (); | |
157 | typename rules::id_vector::const_iterator unique_ids_iter_ = | |
158 | unique_ids_[state_].begin (); | |
159 | typename rules::id_vector::const_iterator states_iter_ = | |
160 | states_[state_].begin (); | |
161 | const typename rules::string ®ex_ = *regex_iter_; | |
162 | // map of regex charset tokens (strings) to index | |
163 | token_map token_map_; | |
164 | const typename rules::string_pair_deque ¯odeque_ = | |
165 | rules_.macrodeque (); | |
166 | typename parser::macro_map macromap_; | |
167 | typename detail::node::node_vector tree_vector_; | |
168 | ||
169 | build_macros (token_map_, macrodeque_, macromap_, | |
170 | rules_.flags (), rules_.locale (), node_ptr_vector_, | |
171 | internals_._seen_BOL_assertion, internals_._seen_EOL_assertion); | |
172 | ||
173 | detail::node *root_ = parser::parse (regex_.c_str (), | |
174 | regex_.c_str () + regex_.size (), *ids_iter_, *unique_ids_iter_, | |
175 | *states_iter_, rules_.flags (), rules_.locale (), node_ptr_vector_, | |
176 | macromap_, token_map_, internals_._seen_BOL_assertion, | |
177 | internals_._seen_EOL_assertion); | |
178 | ||
179 | ++regex_iter_; | |
180 | ++ids_iter_; | |
181 | ++unique_ids_iter_; | |
182 | ++states_iter_; | |
183 | tree_vector_.push_back (root_); | |
184 | ||
185 | // build syntax trees | |
186 | while (regex_iter_ != regex_iter_end_) | |
187 | { | |
188 | // re-declare var, otherwise we perform an assignment..! | |
189 | const typename rules::string ®ex2_ = *regex_iter_; | |
190 | ||
191 | root_ = parser::parse (regex2_.c_str (), | |
192 | regex2_.c_str () + regex2_.size (), *ids_iter_, | |
193 | *unique_ids_iter_, *states_iter_, rules_.flags (), | |
194 | rules_.locale (), node_ptr_vector_, macromap_, token_map_, | |
195 | internals_._seen_BOL_assertion, | |
196 | internals_._seen_EOL_assertion); | |
197 | tree_vector_.push_back (root_); | |
198 | ++regex_iter_; | |
199 | ++ids_iter_; | |
200 | ++unique_ids_iter_; | |
201 | ++states_iter_; | |
202 | } | |
203 | ||
204 | if (internals_._seen_BOL_assertion) | |
205 | { | |
206 | // Fixup BOLs | |
207 | typename detail::node::node_vector::iterator iter_ = | |
208 | tree_vector_.begin (); | |
209 | typename detail::node::node_vector::iterator end_ = | |
210 | tree_vector_.end (); | |
211 | ||
212 | for (; iter_ != end_; ++iter_) | |
213 | { | |
214 | fixup_bol (*iter_, node_ptr_vector_); | |
215 | } | |
216 | } | |
217 | ||
218 | // join trees | |
219 | { | |
220 | typename detail::node::node_vector::iterator iter_ = | |
221 | tree_vector_.begin (); | |
222 | typename detail::node::node_vector::iterator end_ = | |
223 | tree_vector_.end (); | |
224 | ||
225 | if (iter_ != end_) | |
226 | { | |
227 | root_ = *iter_; | |
228 | ++iter_; | |
229 | } | |
230 | ||
231 | for (; iter_ != end_; ++iter_) | |
232 | { | |
233 | node_ptr_vector_->push_back (static_cast<detail::selection_node *>(0)); | |
234 | node_ptr_vector_->back () = new detail::selection_node | |
235 | (root_, *iter_); | |
236 | root_ = node_ptr_vector_->back (); | |
237 | } | |
238 | } | |
239 | ||
240 | // partitioned token list | |
241 | charset_list token_list_; | |
242 | ||
243 | set_mapping_.resize (token_map_.size ()); | |
244 | partition_tokens (token_map_, token_list_); | |
245 | ||
246 | typename charset_list::list::const_iterator iter_ = | |
247 | token_list_->begin (); | |
248 | typename charset_list::list::const_iterator end_ = | |
249 | token_list_->end (); | |
250 | std::size_t index_ = 0; | |
251 | ||
252 | for (; iter_ != end_; ++iter_, ++index_) | |
253 | { | |
254 | const charset *cs_ = *iter_; | |
255 | typename charset::index_set::const_iterator set_iter_ = | |
256 | cs_->_index_set.begin (); | |
257 | typename charset::index_set::const_iterator set_end_ = | |
258 | cs_->_index_set.end (); | |
259 | ||
260 | fill_lookup (cs_->_token, lookup_, index_); | |
261 | ||
262 | for (; set_iter_ != set_end_; ++set_iter_) | |
263 | { | |
264 | set_mapping_[*set_iter_].insert (index_); | |
265 | } | |
266 | } | |
267 | ||
268 | internals_._dfa_alphabet[state_] = token_list_->size () + dfa_offset; | |
269 | return root_; | |
270 | } | |
271 | ||
272 | static void build_macros (token_map &token_map_, | |
273 | const macro_deque ¯odeque_, | |
274 | typename parser::macro_map ¯omap_, const regex_flags flags_, | |
275 | const std::locale &locale_, node_ptr_vector &node_ptr_vector_, | |
276 | bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) | |
277 | { | |
278 | for (typename macro_deque::const_iterator iter_ = | |
279 | macrodeque_.begin (), end_ = macrodeque_.end (); | |
280 | iter_ != end_; ++iter_) | |
281 | { | |
282 | const typename rules::string &name_ = iter_->first; | |
283 | const typename rules::string ®ex_ = iter_->second; | |
284 | detail::node *node_ = parser::parse (regex_.c_str (), | |
285 | regex_.c_str () + regex_.size (), 0, 0, 0, flags_, | |
286 | locale_, node_ptr_vector_, macromap_, token_map_, | |
287 | seen_BOL_assertion_, seen_EOL_assertion_); | |
288 | macro_iter_pair map_iter_ = macromap_. | |
289 | insert (macro_pair (name_, static_cast<const detail::node *> | |
290 | (0))); | |
291 | ||
292 | map_iter_.first->second = node_; | |
293 | } | |
294 | } | |
295 | ||
296 | static void build_dfa (detail::node *root_, | |
297 | const index_set_vector &set_mapping_, const std::size_t dfa_alphabet_, | |
298 | size_t_vector &dfa_) | |
299 | { | |
300 | typename detail::node::node_vector *followpos_ = | |
301 | &root_->firstpos (); | |
302 | node_set_vector seen_sets_; | |
303 | node_vector_vector seen_vectors_; | |
304 | size_t_vector hash_vector_; | |
305 | ||
306 | // 'jam' state | |
307 | dfa_.resize (dfa_alphabet_, 0); | |
308 | closure (followpos_, seen_sets_, seen_vectors_, | |
309 | hash_vector_, dfa_alphabet_, dfa_); | |
310 | ||
311 | std::size_t *ptr_ = 0; | |
312 | ||
313 | for (std::size_t index_ = 0; index_ < seen_vectors_->size (); ++index_) | |
314 | { | |
315 | equivset_list equiv_list_; | |
316 | ||
317 | build_equiv_list (seen_vectors_[index_], set_mapping_, equiv_list_); | |
318 | ||
319 | for (typename equivset_list::list::const_iterator iter_ = | |
320 | equiv_list_->begin (), end_ = equiv_list_->end (); | |
321 | iter_ != end_; ++iter_) | |
322 | { | |
323 | equivset *equivset_ = *iter_; | |
324 | const std::size_t transition_ = closure | |
325 | (&equivset_->_followpos, seen_sets_, seen_vectors_, | |
326 | hash_vector_, dfa_alphabet_, dfa_); | |
327 | ||
328 | if (transition_ != npos) | |
329 | { | |
330 | ptr_ = &dfa_.front () + ((index_ + 1) * dfa_alphabet_); | |
331 | ||
332 | // Prune abstemious transitions from end states. | |
333 | if (*ptr_ && !equivset_->_greedy) continue; | |
334 | ||
335 | for (typename detail::equivset::index_vector::const_iterator | |
336 | equiv_iter_ = equivset_->_index_vector.begin (), | |
337 | equiv_end_ = equivset_->_index_vector.end (); | |
338 | equiv_iter_ != equiv_end_; ++equiv_iter_) | |
339 | { | |
340 | const std::size_t equiv_index_ = *equiv_iter_; | |
341 | ||
342 | if (equiv_index_ == bol_token) | |
343 | { | |
344 | if (ptr_[eol_index] == 0) | |
345 | { | |
346 | ptr_[bol_index] = transition_; | |
347 | } | |
348 | } | |
349 | else if (equiv_index_ == eol_token) | |
350 | { | |
351 | if (ptr_[bol_index] == 0) | |
352 | { | |
353 | ptr_[eol_index] = transition_; | |
354 | } | |
355 | } | |
356 | else | |
357 | { | |
358 | ptr_[equiv_index_ + dfa_offset] = transition_; | |
359 | } | |
360 | } | |
361 | } | |
362 | } | |
363 | } | |
364 | } | |
365 | ||
366 | static std::size_t closure (typename detail::node::node_vector *followpos_, | |
367 | node_set_vector &seen_sets_, node_vector_vector &seen_vectors_, | |
368 | size_t_vector &hash_vector_, const std::size_t size_, | |
369 | size_t_vector &dfa_) | |
370 | { | |
371 | bool end_state_ = false; | |
372 | std::size_t id_ = 0; | |
373 | std::size_t unique_id_ = npos; | |
374 | std::size_t state_ = 0; | |
375 | std::size_t hash_ = 0; | |
376 | ||
377 | if (followpos_->empty ()) return npos; | |
378 | ||
379 | std::size_t index_ = 0; | |
380 | std::auto_ptr<node_set> set_ptr_ (new node_set); | |
381 | std::auto_ptr<node_vector> vector_ptr_ (new node_vector); | |
382 | ||
383 | for (typename detail::node::node_vector::const_iterator iter_ = | |
384 | followpos_->begin (), end_ = followpos_->end (); | |
385 | iter_ != end_; ++iter_) | |
386 | { | |
387 | closure_ex (*iter_, end_state_, id_, unique_id_, state_, | |
388 | set_ptr_.get (), vector_ptr_.get (), hash_); | |
389 | } | |
390 | ||
391 | bool found_ = false; | |
392 | typename size_t_vector::const_iterator hash_iter_ = | |
393 | hash_vector_.begin (); | |
394 | typename size_t_vector::const_iterator hash_end_ = | |
395 | hash_vector_.end (); | |
396 | typename node_set_vector::vector::const_iterator set_iter_ = | |
397 | seen_sets_->begin (); | |
398 | ||
399 | for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_) | |
400 | { | |
401 | found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_; | |
402 | ++index_; | |
403 | ||
404 | if (found_) break; | |
405 | } | |
406 | ||
407 | if (!found_) | |
408 | { | |
409 | seen_sets_->push_back (static_cast<node_set *>(0)); | |
410 | seen_sets_->back () = set_ptr_.release (); | |
411 | seen_vectors_->push_back (static_cast<node_vector *>(0)); | |
412 | seen_vectors_->back () = vector_ptr_.release (); | |
413 | hash_vector_.push_back (hash_); | |
414 | // State 0 is the jam state... | |
415 | index_ = seen_sets_->size (); | |
416 | ||
417 | const std::size_t old_size_ = dfa_.size (); | |
418 | ||
419 | dfa_.resize (old_size_ + size_, 0); | |
420 | ||
421 | if (end_state_) | |
422 | { | |
423 | dfa_[old_size_] |= end_state; | |
424 | dfa_[old_size_ + id_index] = id_; | |
425 | dfa_[old_size_ + unique_id_index] = unique_id_; | |
426 | dfa_[old_size_ + state_index] = state_; | |
427 | } | |
428 | } | |
429 | ||
430 | return index_; | |
431 | } | |
432 | ||
433 | static void closure_ex (detail::node *node_, bool &end_state_, | |
434 | std::size_t &id_, std::size_t &unique_id_, std::size_t &state_, | |
435 | node_set *set_ptr_, node_vector *vector_ptr_, std::size_t &hash_) | |
436 | { | |
437 | const bool temp_end_state_ = node_->end_state (); | |
438 | ||
439 | if (temp_end_state_) | |
440 | { | |
441 | if (!end_state_) | |
442 | { | |
443 | end_state_ = true; | |
444 | id_ = node_->id (); | |
445 | unique_id_ = node_->unique_id (); | |
446 | state_ = node_->lexer_state (); | |
447 | } | |
448 | } | |
449 | ||
450 | if (set_ptr_->insert (node_).second) | |
451 | { | |
452 | vector_ptr_->push_back (node_); | |
453 | hash_ += reinterpret_cast<std::size_t> (node_); | |
454 | } | |
455 | } | |
456 | ||
457 | static void partition_tokens (const token_map &map_, | |
458 | charset_list &lhs_) | |
459 | { | |
460 | charset_list rhs_; | |
461 | ||
462 | fill_rhs_list (map_, rhs_); | |
463 | ||
464 | if (!rhs_->empty ()) | |
465 | { | |
466 | typename charset_list::list::iterator iter_; | |
467 | typename charset_list::list::iterator end_; | |
468 | charset_ptr overlap_ (new charset); | |
469 | ||
470 | lhs_->push_back (static_cast<charset *>(0)); | |
471 | lhs_->back () = rhs_->front (); | |
472 | rhs_->pop_front (); | |
473 | ||
474 | while (!rhs_->empty ()) | |
475 | { | |
476 | charset_ptr r_ (rhs_->front ()); | |
477 | ||
478 | rhs_->pop_front (); | |
479 | iter_ = lhs_->begin (); | |
480 | end_ = lhs_->end (); | |
481 | ||
482 | while (!r_->empty () && iter_ != end_) | |
483 | { | |
484 | typename charset_list::list::iterator l_iter_ = iter_; | |
485 | ||
486 | (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); | |
487 | ||
488 | if (overlap_->empty ()) | |
489 | { | |
490 | ++iter_; | |
491 | } | |
492 | else if ((*l_iter_)->empty ()) | |
493 | { | |
494 | delete *l_iter_; | |
495 | *l_iter_ = overlap_.release (); | |
496 | ||
497 | // VC++ 6 Hack: | |
498 | charset_ptr temp_overlap_ (new charset); | |
499 | ||
500 | overlap_ = temp_overlap_; | |
501 | ++iter_; | |
502 | } | |
503 | else if (r_->empty ()) | |
504 | { | |
505 | delete r_.release (); | |
506 | r_ = overlap_; | |
507 | ||
508 | // VC++ 6 Hack: | |
509 | charset_ptr temp_overlap_ (new charset); | |
510 | ||
511 | overlap_ = temp_overlap_; | |
512 | break; | |
513 | } | |
514 | else | |
515 | { | |
516 | iter_ = lhs_->insert (++iter_, | |
517 | static_cast<charset *>(0)); | |
518 | *iter_ = overlap_.release (); | |
519 | ||
520 | // VC++ 6 Hack: | |
521 | charset_ptr temp_overlap_ (new charset); | |
522 | ||
523 | overlap_ = temp_overlap_; | |
524 | ++iter_; | |
525 | end_ = lhs_->end (); | |
526 | } | |
527 | } | |
528 | ||
529 | if (!r_->empty ()) | |
530 | { | |
531 | lhs_->push_back (static_cast<charset *>(0)); | |
532 | lhs_->back () = r_.release (); | |
533 | } | |
534 | } | |
535 | } | |
536 | } | |
537 | ||
538 | static void fill_rhs_list (const token_map &map_, | |
539 | charset_list &list_) | |
540 | { | |
541 | typename parser::tokeniser::token_map::const_iterator iter_ = | |
542 | map_.begin (); | |
543 | typename parser::tokeniser::token_map::const_iterator end_ = | |
544 | map_.end (); | |
545 | ||
546 | for (; iter_ != end_; ++iter_) | |
547 | { | |
548 | list_->push_back (static_cast<charset *>(0)); | |
549 | list_->back () = new charset (iter_->first, iter_->second); | |
550 | } | |
551 | } | |
552 | ||
553 | static void fill_lookup (const string_token &token_, | |
554 | size_t_vector *lookup_, const std::size_t index_) | |
555 | { | |
556 | const CharT *curr_ = token_._charset.c_str (); | |
557 | const CharT *chars_end_ = curr_ + token_._charset.size (); | |
558 | std::size_t *ptr_ = &lookup_->front (); | |
559 | const std::size_t max_ = sizeof (CharT) == 1 ? | |
560 | num_chars : num_wchar_ts; | |
561 | ||
562 | if (token_._negated) | |
563 | { | |
564 | // $$$ FIXME JDG July 2014 $$$ | |
565 | // this code is problematic on platforms where wchar_t is signed | |
566 | // with min generating negative numbers. This crashes with BAD_ACCESS | |
567 | // because of the vector index below: | |
568 | // ptr_[static_cast<typename Traits::index_type>(curr_char_)] | |
569 | CharT curr_char_ = 0; // (std::numeric_limits<CharT>::min)(); | |
570 | std::size_t i_ = 0; | |
571 | ||
572 | while (curr_ < chars_end_) | |
573 | { | |
574 | while (*curr_ > curr_char_) | |
575 | { | |
576 | ptr_[static_cast<typename Traits::index_type> | |
577 | (curr_char_)] = index_ + dfa_offset; | |
578 | ++curr_char_; | |
579 | ++i_; | |
580 | } | |
581 | ||
582 | ++curr_char_; | |
583 | ++curr_; | |
584 | ++i_; | |
585 | } | |
586 | ||
587 | for (; i_ < max_; ++i_) | |
588 | { | |
589 | ptr_[static_cast<typename Traits::index_type>(curr_char_)] = | |
590 | index_ + dfa_offset; | |
591 | ++curr_char_; | |
592 | } | |
593 | } | |
594 | else | |
595 | { | |
596 | while (curr_ < chars_end_) | |
597 | { | |
598 | ptr_[static_cast<typename Traits::index_type>(*curr_)] = | |
599 | index_ + dfa_offset; | |
600 | ++curr_; | |
601 | } | |
602 | } | |
603 | } | |
604 | ||
605 | static void build_equiv_list (const node_vector *vector_, | |
606 | const index_set_vector &set_mapping_, equivset_list &lhs_) | |
607 | { | |
608 | equivset_list rhs_; | |
609 | ||
610 | fill_rhs_list (vector_, set_mapping_, rhs_); | |
611 | ||
612 | if (!rhs_->empty ()) | |
613 | { | |
614 | typename equivset_list::list::iterator iter_; | |
615 | typename equivset_list::list::iterator end_; | |
616 | equivset_ptr overlap_ (new equivset); | |
617 | ||
618 | lhs_->push_back (static_cast<equivset *>(0)); | |
619 | lhs_->back () = rhs_->front (); | |
620 | rhs_->pop_front (); | |
621 | ||
622 | while (!rhs_->empty ()) | |
623 | { | |
624 | equivset_ptr r_ (rhs_->front ()); | |
625 | ||
626 | rhs_->pop_front (); | |
627 | iter_ = lhs_->begin (); | |
628 | end_ = lhs_->end (); | |
629 | ||
630 | while (!r_->empty () && iter_ != end_) | |
631 | { | |
632 | typename equivset_list::list::iterator l_iter_ = iter_; | |
633 | ||
634 | (*l_iter_)->intersect (*r_.get (), *overlap_.get ()); | |
635 | ||
636 | if (overlap_->empty ()) | |
637 | { | |
638 | ++iter_; | |
639 | } | |
640 | else if ((*l_iter_)->empty ()) | |
641 | { | |
642 | delete *l_iter_; | |
643 | *l_iter_ = overlap_.release (); | |
644 | ||
645 | // VC++ 6 Hack: | |
646 | equivset_ptr temp_overlap_ (new equivset); | |
647 | ||
648 | overlap_ = temp_overlap_; | |
649 | ++iter_; | |
650 | } | |
651 | else if (r_->empty ()) | |
652 | { | |
653 | delete r_.release (); | |
654 | r_ = overlap_; | |
655 | ||
656 | // VC++ 6 Hack: | |
657 | equivset_ptr temp_overlap_ (new equivset); | |
658 | ||
659 | overlap_ = temp_overlap_; | |
660 | break; | |
661 | } | |
662 | else | |
663 | { | |
664 | iter_ = lhs_->insert (++iter_, | |
665 | static_cast<equivset *>(0)); | |
666 | *iter_ = overlap_.release (); | |
667 | ||
668 | // VC++ 6 Hack: | |
669 | equivset_ptr temp_overlap_ (new equivset); | |
670 | ||
671 | overlap_ = temp_overlap_; | |
672 | ++iter_; | |
673 | end_ = lhs_->end (); | |
674 | } | |
675 | } | |
676 | ||
677 | if (!r_->empty ()) | |
678 | { | |
679 | lhs_->push_back (static_cast<equivset *>(0)); | |
680 | lhs_->back () = r_.release (); | |
681 | } | |
682 | } | |
683 | } | |
684 | } | |
685 | ||
686 | static void fill_rhs_list (const node_vector *vector_, | |
687 | const index_set_vector &set_mapping_, equivset_list &list_) | |
688 | { | |
689 | typename node_vector::const_iterator iter_ = | |
690 | vector_->begin (); | |
691 | typename node_vector::const_iterator end_ = | |
692 | vector_->end (); | |
693 | ||
694 | for (; iter_ != end_; ++iter_) | |
695 | { | |
696 | const detail::node *node_ = *iter_; | |
697 | ||
698 | if (!node_->end_state ()) | |
699 | { | |
700 | const std::size_t token_ = node_->token (); | |
701 | ||
702 | if (token_ != null_token) | |
703 | { | |
704 | list_->push_back (static_cast<equivset *>(0)); | |
705 | ||
706 | if (token_ == bol_token || token_ == eol_token) | |
707 | { | |
708 | std::set<std::size_t> index_set_; | |
709 | ||
710 | index_set_.insert (token_); | |
711 | list_->back () = new equivset (index_set_, | |
712 | node_->greedy (), token_, node_->followpos ()); | |
713 | } | |
714 | else | |
715 | { | |
716 | list_->back () = new equivset (set_mapping_[token_], | |
717 | node_->greedy (), token_, node_->followpos ()); | |
718 | } | |
719 | } | |
720 | } | |
721 | } | |
722 | } | |
723 | ||
724 | static void fixup_bol (detail::node * &root_, | |
725 | node_ptr_vector &node_ptr_vector_) | |
726 | { | |
727 | typename detail::node::node_vector *first_ = &root_->firstpos (); | |
728 | bool found_ = false; | |
729 | typename detail::node::node_vector::const_iterator iter_ = | |
730 | first_->begin (); | |
731 | typename detail::node::node_vector::const_iterator end_ = | |
732 | first_->end (); | |
733 | ||
734 | for (; iter_ != end_; ++iter_) | |
735 | { | |
736 | const detail::node *node_ = *iter_; | |
737 | ||
738 | found_ = !node_->end_state () && node_->token () == bol_token; | |
739 | ||
740 | if (found_) break; | |
741 | } | |
742 | ||
743 | if (!found_) | |
744 | { | |
745 | node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0)); | |
746 | node_ptr_vector_->back () = new detail::leaf_node | |
747 | (bol_token, true); | |
748 | ||
749 | detail::node *lhs_ = node_ptr_vector_->back (); | |
750 | ||
751 | node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0)); | |
752 | node_ptr_vector_->back () = new detail::leaf_node | |
753 | (null_token, true); | |
754 | ||
755 | detail::node *rhs_ = node_ptr_vector_->back (); | |
756 | ||
757 | node_ptr_vector_->push_back | |
758 | (static_cast<detail::selection_node *>(0)); | |
759 | node_ptr_vector_->back () = | |
760 | new detail::selection_node (lhs_, rhs_); | |
761 | lhs_ = node_ptr_vector_->back (); | |
762 | ||
763 | node_ptr_vector_->push_back | |
764 | (static_cast<detail::sequence_node *>(0)); | |
765 | node_ptr_vector_->back () = | |
766 | new detail::sequence_node (lhs_, root_); | |
767 | root_ = node_ptr_vector_->back (); | |
768 | } | |
769 | } | |
770 | ||
771 | static void minimise_dfa (const std::size_t dfa_alphabet_, | |
772 | size_t_vector &dfa_, std::size_t size_) | |
773 | { | |
774 | const std::size_t *first_ = &dfa_.front (); | |
775 | const std::size_t *second_ = 0; | |
776 | const std::size_t *end_ = first_ + size_; | |
777 | std::size_t index_ = 1; | |
778 | std::size_t new_index_ = 1; | |
779 | std::size_t curr_index_ = 0; | |
780 | index_set index_set_; | |
781 | size_t_vector lookup_; | |
782 | std::size_t *lookup_ptr_ = 0; | |
783 | ||
784 | lookup_.resize (size_ / dfa_alphabet_, null_token); | |
785 | lookup_ptr_ = &lookup_.front (); | |
786 | *lookup_ptr_ = 0; | |
787 | // Only one 'jam' state, so skip it. | |
788 | first_ += dfa_alphabet_; | |
789 | ||
790 | for (; first_ < end_; first_ += dfa_alphabet_, ++index_) | |
791 | { | |
792 | for (second_ = first_ + dfa_alphabet_, curr_index_ = index_ + 1; | |
793 | second_ < end_; second_ += dfa_alphabet_, ++curr_index_) | |
794 | { | |
795 | if (index_set_.find (curr_index_) != index_set_.end ()) | |
796 | { | |
797 | continue; | |
798 | } | |
799 | ||
800 | // Some systems have memcmp in namespace std. | |
801 | using namespace std; | |
802 | ||
803 | if (memcmp (first_, second_, sizeof (std::size_t) * | |
804 | dfa_alphabet_) == 0) | |
805 | { | |
806 | index_set_.insert (curr_index_); | |
807 | lookup_ptr_[curr_index_] = new_index_; | |
808 | } | |
809 | } | |
810 | ||
811 | if (lookup_ptr_[index_] == null_token) | |
812 | { | |
813 | lookup_ptr_[index_] = new_index_; | |
814 | ++new_index_; | |
815 | } | |
816 | } | |
817 | ||
818 | if (!index_set_.empty ()) | |
819 | { | |
820 | const std::size_t *front_ = &dfa_.front (); | |
821 | size_t_vector new_dfa_ (front_, front_ + dfa_alphabet_); | |
822 | typename index_set::iterator set_end_ = | |
823 | index_set_.end (); | |
824 | const std::size_t *ptr_ = front_ + dfa_alphabet_; | |
825 | std::size_t *new_ptr_ = 0; | |
826 | ||
827 | new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0); | |
828 | new_ptr_ = &new_dfa_.front () + dfa_alphabet_; | |
829 | size_ /= dfa_alphabet_; | |
830 | ||
831 | for (index_ = 1; index_ < size_; ++index_) | |
832 | { | |
833 | if (index_set_.find (index_) != set_end_) | |
834 | { | |
835 | ptr_ += dfa_alphabet_; | |
836 | continue; | |
837 | } | |
838 | ||
839 | new_ptr_[end_state_index] = ptr_[end_state_index]; | |
840 | new_ptr_[id_index] = ptr_[id_index]; | |
841 | new_ptr_[unique_id_index] = ptr_[unique_id_index]; | |
842 | new_ptr_[state_index] = ptr_[state_index]; | |
843 | new_ptr_[bol_index] = lookup_ptr_[ptr_[bol_index]]; | |
844 | new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]]; | |
845 | new_ptr_ += dfa_offset; | |
846 | ptr_ += dfa_offset; | |
847 | ||
848 | for (std::size_t i_ = dfa_offset; i_ < dfa_alphabet_; ++i_) | |
849 | { | |
850 | *new_ptr_++ = lookup_ptr_[*ptr_++]; | |
851 | } | |
852 | } | |
853 | ||
854 | dfa_.swap (new_dfa_); | |
855 | } | |
856 | } | |
857 | }; | |
858 | ||
859 | typedef basic_generator<char> generator; | |
860 | typedef basic_generator<wchar_t> wgenerator; | |
861 | } | |
862 | } | |
863 | ||
864 | #endif |