]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // generate_cpp.hpp |
2 | // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
6 | #ifndef BOOST_LEXER_GENERATE_CPP_HPP | |
7 | #define BOOST_LEXER_GENERATE_CPP_HPP | |
8 | ||
9 | #include "char_traits.hpp" | |
10 | #include "consts.hpp" | |
11 | #include "internals.hpp" | |
12 | #include <iostream> | |
13 | #include <boost/detail/iterator.hpp> | |
14 | #include "runtime_error.hpp" | |
15 | #include "size_t.hpp" | |
16 | #include "state_machine.hpp" | |
17 | #include <vector> | |
18 | ||
19 | namespace boost | |
20 | { | |
21 | namespace lexer | |
22 | { | |
23 | template<typename CharT> | |
24 | void generate_cpp (const basic_state_machine<CharT> &state_machine_, | |
25 | std::ostream &os_, const bool use_pointers_ = false, | |
26 | const bool skip_unknown_ = true, const bool optimise_parameters_ = true, | |
27 | const char *name_ = "next_token") | |
28 | { | |
29 | const detail::internals &sm_ = state_machine_.data (); | |
30 | ||
31 | if (sm_._lookup->size () == 0) | |
32 | { | |
33 | throw runtime_error ("Cannot generate code from an empty " | |
34 | "state machine"); | |
35 | } | |
36 | ||
37 | std::string upper_name_ (__DATE__); | |
38 | const std::size_t lookups_ = sm_._lookup->front ()->size (); | |
39 | const std::size_t dfas_ = sm_._dfa->size (); | |
40 | std::string::size_type pos_ = upper_name_.find (' '); | |
41 | const char *iterator_ = 0; | |
42 | ||
43 | if (use_pointers_) | |
44 | { | |
45 | if (lookups_ == 256) | |
46 | { | |
47 | iterator_ = "const char *"; | |
48 | } | |
49 | else | |
50 | { | |
51 | iterator_ = "const wchar_t *"; | |
52 | } | |
53 | } | |
54 | else | |
55 | { | |
56 | iterator_ = "Iterator &"; | |
57 | } | |
58 | ||
59 | while (pos_ != std::string::npos) | |
60 | { | |
61 | upper_name_.replace (pos_, 1, "_"); | |
62 | pos_ = upper_name_.find (' ', pos_); | |
63 | } | |
64 | ||
65 | upper_name_ += '_'; | |
66 | upper_name_ += __TIME__; | |
67 | ||
68 | pos_ = upper_name_.find (':'); | |
69 | ||
70 | while (pos_ != std::string::npos) | |
71 | { | |
72 | upper_name_.erase (pos_, 1); | |
73 | pos_ = upper_name_.find (':', pos_); | |
74 | } | |
75 | ||
76 | upper_name_ = '_' + upper_name_; | |
77 | upper_name_ = name_ + upper_name_; | |
78 | std::transform (upper_name_.begin (), upper_name_.end (), | |
79 | upper_name_.begin (), ::toupper); | |
80 | os_ << "#ifndef " << upper_name_ + '\n'; | |
81 | os_ << "#define " << upper_name_ + '\n'; | |
82 | os_ << "// Copyright (c) 2008-2009 Ben Hanson\n"; | |
83 | os_ << "//\n"; | |
84 | os_ << "// Distributed under the Boost Software License, " | |
85 | "Version 1.0. (See accompanying\n"; | |
86 | os_ << "// file licence_1_0.txt or copy at " | |
87 | "http://www.boost.org/LICENSE_1_0.txt)\n\n"; | |
88 | os_ << "// Auto-generated by boost::lexer\n"; | |
89 | os_ << "template<typename Iterator>\n"; | |
90 | os_ << "std::size_t " << name_ << " ("; | |
91 | ||
92 | if (dfas_ > 1 || !optimise_parameters_) | |
93 | { | |
94 | os_ << "std::size_t &start_state_, "; | |
95 | } | |
96 | ||
97 | if (use_pointers_) | |
98 | { | |
99 | os_ << iterator_ << " &"; | |
100 | } | |
101 | else | |
102 | { | |
103 | os_ << iterator_; | |
104 | } | |
105 | ||
106 | os_ << "start_token_, "; | |
107 | ||
108 | if (use_pointers_) | |
109 | { | |
110 | os_ << iterator_ << " const "; | |
111 | } | |
112 | else | |
113 | { | |
114 | os_ << "const " << iterator_; | |
115 | } | |
116 | ||
117 | os_ << "end_, \n"; | |
118 | os_ << " std::size_t &unique_id_"; | |
119 | ||
120 | if (sm_._seen_BOL_assertion || !optimise_parameters_) | |
121 | { | |
122 | os_ << ", bool &beg_of_line_"; | |
123 | } | |
124 | ||
125 | os_ << ")\n"; | |
126 | os_ << "{\n"; | |
127 | os_ << " enum {end_state_index, id_index, unique_id_index, state_index, bol_index,\n"; | |
128 | os_ << " eol_index, dead_state_index, dfa_offset};\n"; | |
129 | os_ << " static const std::size_t npos = static_cast" | |
130 | "<std::size_t>(~0);\n"; | |
131 | ||
132 | if (dfas_ > 1) | |
133 | { | |
134 | std::size_t state_ = 0; | |
135 | ||
136 | for (; state_ < dfas_; ++state_) | |
137 | { | |
138 | std::size_t i_ = 0; | |
139 | std::size_t j_ = 1; | |
140 | std::size_t count_ = lookups_ / 8; | |
141 | const std::size_t *lookup_ = &sm_._lookup[state_]->front (); | |
142 | const std::size_t *dfa_ = &sm_._dfa[state_]->front (); | |
143 | ||
144 | os_ << " static const std::size_t lookup" << state_ << "_[" << | |
145 | lookups_ << "] = {"; | |
146 | ||
147 | for (; i_ < count_; ++i_) | |
148 | { | |
149 | const std::size_t index_ = i_ * 8; | |
150 | ||
151 | os_ << lookup_[index_]; | |
152 | ||
153 | for (; j_ < 8; ++j_) | |
154 | { | |
155 | os_ << ", " << lookup_[index_ + j_]; | |
156 | } | |
157 | ||
158 | if (i_ < count_ - 1) | |
159 | { | |
160 | os_ << "," << std::endl << " "; | |
161 | } | |
162 | ||
163 | j_ = 1; | |
164 | } | |
165 | ||
166 | os_ << "};\n"; | |
167 | count_ = sm_._dfa[state_]->size (); | |
168 | os_ << " static const std::size_t dfa" << state_ << "_[" << | |
169 | count_ << "] = {"; | |
170 | count_ /= 8; | |
171 | ||
172 | for (i_ = 0; i_ < count_; ++i_) | |
173 | { | |
174 | const std::size_t index_ = i_ * 8; | |
175 | ||
176 | os_ << dfa_[index_]; | |
177 | ||
178 | for (j_ = 1; j_ < 8; ++j_) | |
179 | { | |
180 | os_ << ", " << dfa_[index_ + j_]; | |
181 | } | |
182 | ||
183 | if (i_ < count_ - 1) | |
184 | { | |
185 | os_ << "," << std::endl << " "; | |
186 | } | |
187 | } | |
188 | ||
189 | const std::size_t mod_ = sm_._dfa[state_]->size () % 8; | |
190 | ||
191 | if (mod_) | |
192 | { | |
193 | const std::size_t index_ = count_ * 8; | |
194 | ||
195 | if (count_) | |
196 | { | |
197 | os_ << ",\n "; | |
198 | } | |
199 | ||
200 | os_ << dfa_[index_]; | |
201 | ||
202 | for (j_ = 1; j_ < mod_; ++j_) | |
203 | { | |
204 | os_ << ", " << dfa_[index_ + j_]; | |
205 | } | |
206 | } | |
207 | ||
208 | os_ << "};\n"; | |
209 | } | |
210 | ||
211 | std::size_t count_ = sm_._dfa_alphabet.size (); | |
212 | std::size_t i_ = 1; | |
213 | ||
214 | os_ << " static const std::size_t *lookup_arr_[" << count_ << | |
215 | "] = {"; | |
216 | os_ << "lookup0_"; | |
217 | ||
218 | for (i_ = 1; i_ < count_; ++i_) | |
219 | { | |
220 | os_ << ", " << "lookup" << i_ << "_"; | |
221 | } | |
222 | ||
223 | os_ << "};\n"; | |
224 | os_ << " static const std::size_t dfa_alphabet_arr_[" << count_ << | |
225 | "] = {"; | |
226 | os_ << sm_._dfa_alphabet.front (); | |
227 | ||
228 | for (i_ = 1; i_ < count_; ++i_) | |
229 | { | |
230 | os_ << ", " << sm_._dfa_alphabet[i_]; | |
231 | } | |
232 | ||
233 | os_ << "};\n"; | |
234 | os_ << " static const std::size_t *dfa_arr_[" << count_ << | |
235 | "] = {"; | |
236 | os_ << "dfa0_"; | |
237 | ||
238 | for (i_ = 1; i_ < count_; ++i_) | |
239 | { | |
240 | os_ << ", " << "dfa" << i_ << "_"; | |
241 | } | |
242 | ||
243 | os_ << "};\n"; | |
244 | } | |
245 | else | |
246 | { | |
247 | const std::size_t *lookup_ = &sm_._lookup->front ()->front (); | |
248 | const std::size_t *dfa_ = &sm_._dfa->front ()->front (); | |
249 | std::size_t i_ = 0; | |
250 | std::size_t j_ = 1; | |
251 | std::size_t count_ = lookups_ / 8; | |
252 | ||
253 | os_ << " static const std::size_t lookup_["; | |
254 | os_ << sm_._lookup->front ()->size () << "] = {"; | |
255 | ||
256 | for (; i_ < count_; ++i_) | |
257 | { | |
258 | const std::size_t index_ = i_ * 8; | |
259 | ||
260 | os_ << lookup_[index_]; | |
261 | ||
262 | for (; j_ < 8; ++j_) | |
263 | { | |
264 | os_ << ", " << lookup_[index_ + j_]; | |
265 | } | |
266 | ||
267 | if (i_ < count_ - 1) | |
268 | { | |
269 | os_ << "," << std::endl << " "; | |
270 | } | |
271 | ||
272 | j_ = 1; | |
273 | } | |
274 | ||
275 | os_ << "};\n"; | |
276 | os_ << " static const std::size_t dfa_alphabet_ = " << | |
277 | sm_._dfa_alphabet.front () << ";\n"; | |
278 | os_ << " static const std::size_t dfa_[" << | |
279 | sm_._dfa->front ()->size () << "] = {"; | |
280 | count_ = sm_._dfa->front ()->size () / 8; | |
281 | ||
282 | for (i_ = 0; i_ < count_; ++i_) | |
283 | { | |
284 | const std::size_t index_ = i_ * 8; | |
285 | ||
286 | os_ << dfa_[index_]; | |
287 | ||
288 | for (j_ = 1; j_ < 8; ++j_) | |
289 | { | |
290 | os_ << ", " << dfa_[index_ + j_]; | |
291 | } | |
292 | ||
293 | if (i_ < count_ - 1) | |
294 | { | |
295 | os_ << "," << std::endl << " "; | |
296 | } | |
297 | } | |
298 | ||
299 | const std::size_t mod_ = sm_._dfa->front ()->size () % 8; | |
300 | ||
301 | if (mod_) | |
302 | { | |
303 | const std::size_t index_ = count_ * 8; | |
304 | ||
305 | if (count_) | |
306 | { | |
307 | os_ << ",\n "; | |
308 | } | |
309 | ||
310 | os_ << dfa_[index_]; | |
311 | ||
312 | for (j_ = 1; j_ < mod_; ++j_) | |
313 | { | |
314 | os_ << ", " << dfa_[index_ + j_]; | |
315 | } | |
316 | } | |
317 | ||
318 | os_ << "};\n"; | |
319 | } | |
320 | ||
321 | os_ << "\n if (start_token_ == end_)\n"; | |
322 | os_ << " {\n"; | |
323 | os_ << " unique_id_ = npos;\n"; | |
324 | os_ << " return 0;\n"; | |
325 | os_ << " }\n\n"; | |
326 | ||
327 | if (dfas_ > 1) | |
328 | { | |
329 | os_ << "again:\n"; | |
330 | os_ << " const std::size_t * lookup_ = " | |
331 | "lookup_arr_[start_state_];\n"; | |
332 | os_ << " std::size_t dfa_alphabet_ = " | |
333 | "dfa_alphabet_arr_[start_state_];\n"; | |
334 | os_ << " const std::size_t *dfa_ = dfa_arr_[start_state_];\n"; | |
335 | } | |
336 | ||
337 | os_ << " const std::size_t *ptr_ = dfa_ + dfa_alphabet_;\n"; | |
338 | os_ << " Iterator curr_ = start_token_;\n"; | |
339 | os_ << " bool end_state_ = *ptr_ != 0;\n"; | |
340 | os_ << " std::size_t id_ = *(ptr_ + id_index);\n"; | |
341 | os_ << " std::size_t uid_ = *(ptr_ + unique_id_index);\n"; | |
342 | ||
343 | if (dfas_ > 1) | |
344 | { | |
345 | os_ << " std::size_t end_start_state_ = start_state_;\n"; | |
346 | } | |
347 | ||
348 | if (sm_._seen_BOL_assertion) | |
349 | { | |
350 | os_ << " bool bol_ = beg_of_line_;\n"; | |
351 | os_ << " bool end_bol_ = bol_;\n"; | |
352 | } | |
353 | ||
354 | os_ << " Iterator end_token_ = start_token_;\n"; | |
355 | os_ << '\n'; | |
356 | os_ << " while (curr_ != end_)\n"; | |
357 | os_ << " {\n"; | |
358 | ||
359 | if (sm_._seen_BOL_assertion) | |
360 | { | |
361 | os_ << " const std::size_t BOL_state_ = ptr_[bol_index];\n"; | |
362 | } | |
363 | ||
364 | if (sm_._seen_EOL_assertion) | |
365 | { | |
366 | os_ << " const std::size_t EOL_state_ = ptr_[eol_index];\n"; | |
367 | } | |
368 | ||
369 | if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion) | |
370 | { | |
371 | os_ << '\n'; | |
372 | } | |
373 | ||
374 | if (sm_._seen_BOL_assertion) | |
375 | { | |
376 | os_ << " if (BOL_state_ && bol_)\n"; | |
377 | os_ << " {\n"; | |
378 | os_ << " ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];\n"; | |
379 | os_ << " }\n"; | |
380 | } | |
381 | ||
382 | if (sm_._seen_EOL_assertion) | |
383 | { | |
384 | os_ << " "; | |
385 | ||
386 | if (sm_._seen_BOL_assertion) | |
387 | { | |
388 | os_ << "else "; | |
389 | } | |
390 | ||
391 | os_ << "if (EOL_state_ && *curr_ == '\\n')\n"; | |
392 | os_ << " {\n"; | |
393 | os_ << " ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n"; | |
394 | os_ << " }\n"; | |
395 | } | |
396 | ||
397 | std::string tab_ (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion ? " " : ""); | |
398 | ||
399 | if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion) | |
400 | { | |
401 | os_ << " else\n"; | |
402 | os_ << " {\n"; | |
403 | } | |
404 | ||
405 | if (sm_._seen_BOL_assertion) | |
406 | { | |
407 | os_ << " "; | |
408 | ||
409 | if (lookups_ == 256) | |
410 | { | |
411 | os_ << "char"; | |
412 | } | |
413 | else | |
414 | { | |
415 | os_ << "wchar_t"; | |
416 | } | |
417 | ||
418 | os_ << " prev_char_ = *curr_++;\n\n"; | |
419 | os_ << " bol_ = prev_char_ == '\\n';\n\n"; | |
420 | } | |
421 | ||
422 | os_ << tab_; | |
423 | os_ << " const std::size_t state_ =\n"; | |
424 | os_ << tab_; | |
425 | os_ << " ptr_[lookup_["; | |
426 | ||
427 | if (lookups_ == 256) | |
428 | { | |
429 | os_ << "static_cast<unsigned char>("; | |
430 | } | |
431 | ||
432 | if (sm_._seen_BOL_assertion) | |
433 | { | |
434 | os_ << "prev_char"; | |
435 | } | |
436 | else | |
437 | { | |
438 | os_ << "*curr_++"; | |
439 | } | |
440 | ||
441 | ||
442 | if (lookups_ == 256) | |
443 | { | |
444 | os_ << ')'; | |
445 | } | |
446 | ||
447 | os_ << "]];\n\n"; | |
448 | ||
449 | os_ << tab_; | |
450 | os_ << " if (state_ == 0) break;\n\n"; | |
451 | os_ << tab_; | |
452 | os_ << " ptr_ = &dfa_[state_ * dfa_alphabet_];\n"; | |
453 | ||
454 | if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion) | |
455 | { | |
456 | os_ << " }\n"; | |
457 | } | |
458 | ||
459 | os_ << '\n'; | |
460 | os_ << " if (*ptr_)\n"; | |
461 | os_ << " {\n"; | |
462 | os_ << " end_state_ = true;\n"; | |
463 | os_ << " id_ = *(ptr_ + id_index);\n"; | |
464 | os_ << " uid_ = *(ptr_ + unique_id_index);\n"; | |
465 | ||
466 | if (dfas_ > 1) | |
467 | { | |
468 | os_ << " end_start_state_ = *(ptr_ + state_index);\n"; | |
469 | } | |
470 | ||
471 | if (sm_._seen_BOL_assertion) | |
472 | { | |
473 | os_ << " end_bol_ = bol_;\n"; | |
474 | } | |
475 | ||
476 | os_ << " end_token_ = curr_;\n"; | |
477 | os_ << " }\n"; | |
478 | os_ << " }\n"; | |
479 | os_ << '\n'; | |
480 | ||
481 | if (sm_._seen_EOL_assertion) | |
482 | { | |
483 | os_ << " const std::size_t EOL_state_ = ptr_[eol_index];\n"; | |
484 | os_ << '\n'; | |
485 | os_ << " if (EOL_state_ && curr_ == end_)\n"; | |
486 | os_ << " {\n"; | |
487 | os_ << " ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n"; | |
488 | os_ << '\n'; | |
489 | os_ << " if (*ptr_)\n"; | |
490 | os_ << " {\n"; | |
491 | os_ << " end_state_ = true;\n"; | |
492 | os_ << " id_ = *(ptr_ + id_index);\n"; | |
493 | os_ << " uid_ = *(ptr_ + unique_id_index);\n"; | |
494 | ||
495 | if (dfas_ > 1) | |
496 | { | |
497 | os_ << " end_start_state_ = *(ptr_ + state_index);\n"; | |
498 | } | |
499 | ||
500 | if (sm_._seen_BOL_assertion) | |
501 | { | |
502 | os_ << " end_bol_ = bol_;\n"; | |
503 | } | |
504 | ||
505 | os_ << " end_token_ = curr_;\n"; | |
506 | os_ << " }\n"; | |
507 | os_ << " }\n"; | |
508 | os_ << '\n'; | |
509 | } | |
510 | ||
511 | os_ << " if (end_state_)\n"; | |
512 | os_ << " {\n"; | |
513 | os_ << " // return longest match\n"; | |
514 | ||
515 | if (dfas_ > 1) | |
516 | { | |
517 | os_ << " start_state_ = end_start_state_;\n"; | |
518 | } | |
519 | ||
520 | if (sm_._seen_BOL_assertion && dfas_ < 2) | |
521 | { | |
522 | os_ << " beg_of_line_ = end_bol_;\n"; | |
523 | } | |
524 | ||
525 | os_ << " start_token_ = end_token_;\n"; | |
526 | ||
527 | if (dfas_ > 1) | |
528 | { | |
529 | os_ << '\n'; | |
530 | os_ << " if (id_ == 0)\n"; | |
531 | os_ << " {\n"; | |
532 | ||
533 | if (sm_._seen_BOL_assertion) | |
534 | { | |
535 | os_ << " bol_ = end_bol_;\n"; | |
536 | } | |
537 | ||
538 | os_ << " goto again;\n"; | |
539 | os_ << " }\n"; | |
540 | ||
541 | if (sm_._seen_BOL_assertion) | |
542 | { | |
543 | os_ << " else\n"; | |
544 | os_ << " {\n"; | |
545 | os_ << " beg_of_line_ = end_bol_;\n"; | |
546 | os_ << " }\n"; | |
547 | } | |
548 | } | |
549 | ||
550 | os_ << " }\n"; | |
551 | os_ << " else\n"; | |
552 | os_ << " {\n"; | |
553 | ||
554 | if (sm_._seen_BOL_assertion) | |
555 | { | |
556 | os_ << " beg_of_line_ = *start_token_ == '\\n';\n"; | |
557 | } | |
558 | ||
559 | if (skip_unknown_) | |
560 | { | |
561 | os_ << " // No match causes char to be skipped\n"; | |
562 | os_ << " ++start_token_;\n"; | |
563 | } | |
564 | ||
565 | os_ << " id_ = npos;\n"; | |
566 | os_ << " uid_ = npos;\n"; | |
567 | os_ << " }\n"; | |
568 | os_ << '\n'; | |
569 | os_ << " unique_id_ = uid_;\n"; | |
570 | os_ << " return id_;\n"; | |
571 | os_ << "}\n"; | |
572 | os_ << "\n#endif\n"; | |
573 | } | |
574 | } | |
575 | } | |
576 | ||
577 | #endif |