]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // generate_cpp.hpp |
2 | // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See accompanying | |
5 | // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) | |
6 | #ifndef BOOST_LEXER_GENERATE_CPP_HPP | |
7 | #define BOOST_LEXER_GENERATE_CPP_HPP | |
8 | ||
9 | #include "char_traits.hpp" | |
10 | #include "consts.hpp" | |
11 | #include "internals.hpp" | |
12 | #include <iostream> | |
7c673cae FG |
13 | #include "runtime_error.hpp" |
14 | #include "size_t.hpp" | |
15 | #include "state_machine.hpp" | |
16 | #include <vector> | |
17 | ||
18 | namespace boost | |
19 | { | |
20 | namespace lexer | |
21 | { | |
22 | template<typename CharT> | |
23 | void generate_cpp (const basic_state_machine<CharT> &state_machine_, | |
24 | std::ostream &os_, const bool use_pointers_ = false, | |
25 | const bool skip_unknown_ = true, const bool optimise_parameters_ = true, | |
26 | const char *name_ = "next_token") | |
27 | { | |
28 | const detail::internals &sm_ = state_machine_.data (); | |
29 | ||
30 | if (sm_._lookup->size () == 0) | |
31 | { | |
32 | throw runtime_error ("Cannot generate code from an empty " | |
33 | "state machine"); | |
34 | } | |
35 | ||
36 | std::string upper_name_ (__DATE__); | |
37 | const std::size_t lookups_ = sm_._lookup->front ()->size (); | |
38 | const std::size_t dfas_ = sm_._dfa->size (); | |
39 | std::string::size_type pos_ = upper_name_.find (' '); | |
40 | const char *iterator_ = 0; | |
41 | ||
42 | if (use_pointers_) | |
43 | { | |
44 | if (lookups_ == 256) | |
45 | { | |
46 | iterator_ = "const char *"; | |
47 | } | |
48 | else | |
49 | { | |
50 | iterator_ = "const wchar_t *"; | |
51 | } | |
52 | } | |
53 | else | |
54 | { | |
55 | iterator_ = "Iterator &"; | |
56 | } | |
57 | ||
58 | while (pos_ != std::string::npos) | |
59 | { | |
60 | upper_name_.replace (pos_, 1, "_"); | |
61 | pos_ = upper_name_.find (' ', pos_); | |
62 | } | |
63 | ||
64 | upper_name_ += '_'; | |
65 | upper_name_ += __TIME__; | |
66 | ||
67 | pos_ = upper_name_.find (':'); | |
68 | ||
69 | while (pos_ != std::string::npos) | |
70 | { | |
71 | upper_name_.erase (pos_, 1); | |
72 | pos_ = upper_name_.find (':', pos_); | |
73 | } | |
74 | ||
75 | upper_name_ = '_' + upper_name_; | |
76 | upper_name_ = name_ + upper_name_; | |
77 | std::transform (upper_name_.begin (), upper_name_.end (), | |
78 | upper_name_.begin (), ::toupper); | |
79 | os_ << "#ifndef " << upper_name_ + '\n'; | |
80 | os_ << "#define " << upper_name_ + '\n'; | |
81 | os_ << "// Copyright (c) 2008-2009 Ben Hanson\n"; | |
82 | os_ << "//\n"; | |
83 | os_ << "// Distributed under the Boost Software License, " | |
84 | "Version 1.0. (See accompanying\n"; | |
85 | os_ << "// file licence_1_0.txt or copy at " | |
86 | "http://www.boost.org/LICENSE_1_0.txt)\n\n"; | |
87 | os_ << "// Auto-generated by boost::lexer\n"; | |
88 | os_ << "template<typename Iterator>\n"; | |
89 | os_ << "std::size_t " << name_ << " ("; | |
90 | ||
91 | if (dfas_ > 1 || !optimise_parameters_) | |
92 | { | |
93 | os_ << "std::size_t &start_state_, "; | |
94 | } | |
95 | ||
96 | if (use_pointers_) | |
97 | { | |
98 | os_ << iterator_ << " &"; | |
99 | } | |
100 | else | |
101 | { | |
102 | os_ << iterator_; | |
103 | } | |
104 | ||
105 | os_ << "start_token_, "; | |
106 | ||
107 | if (use_pointers_) | |
108 | { | |
109 | os_ << iterator_ << " const "; | |
110 | } | |
111 | else | |
112 | { | |
113 | os_ << "const " << iterator_; | |
114 | } | |
115 | ||
116 | os_ << "end_, \n"; | |
117 | os_ << " std::size_t &unique_id_"; | |
118 | ||
119 | if (sm_._seen_BOL_assertion || !optimise_parameters_) | |
120 | { | |
121 | os_ << ", bool &beg_of_line_"; | |
122 | } | |
123 | ||
124 | os_ << ")\n"; | |
125 | os_ << "{\n"; | |
126 | os_ << " enum {end_state_index, id_index, unique_id_index, state_index, bol_index,\n"; | |
127 | os_ << " eol_index, dead_state_index, dfa_offset};\n"; | |
128 | os_ << " static const std::size_t npos = static_cast" | |
129 | "<std::size_t>(~0);\n"; | |
130 | ||
131 | if (dfas_ > 1) | |
132 | { | |
133 | std::size_t state_ = 0; | |
134 | ||
135 | for (; state_ < dfas_; ++state_) | |
136 | { | |
137 | std::size_t i_ = 0; | |
138 | std::size_t j_ = 1; | |
139 | std::size_t count_ = lookups_ / 8; | |
140 | const std::size_t *lookup_ = &sm_._lookup[state_]->front (); | |
141 | const std::size_t *dfa_ = &sm_._dfa[state_]->front (); | |
142 | ||
143 | os_ << " static const std::size_t lookup" << state_ << "_[" << | |
144 | lookups_ << "] = {"; | |
145 | ||
146 | for (; i_ < count_; ++i_) | |
147 | { | |
148 | const std::size_t index_ = i_ * 8; | |
149 | ||
150 | os_ << lookup_[index_]; | |
151 | ||
152 | for (; j_ < 8; ++j_) | |
153 | { | |
154 | os_ << ", " << lookup_[index_ + j_]; | |
155 | } | |
156 | ||
157 | if (i_ < count_ - 1) | |
158 | { | |
159 | os_ << "," << std::endl << " "; | |
160 | } | |
161 | ||
162 | j_ = 1; | |
163 | } | |
164 | ||
165 | os_ << "};\n"; | |
166 | count_ = sm_._dfa[state_]->size (); | |
167 | os_ << " static const std::size_t dfa" << state_ << "_[" << | |
168 | count_ << "] = {"; | |
169 | count_ /= 8; | |
170 | ||
171 | for (i_ = 0; i_ < count_; ++i_) | |
172 | { | |
173 | const std::size_t index_ = i_ * 8; | |
174 | ||
175 | os_ << dfa_[index_]; | |
176 | ||
177 | for (j_ = 1; j_ < 8; ++j_) | |
178 | { | |
179 | os_ << ", " << dfa_[index_ + j_]; | |
180 | } | |
181 | ||
182 | if (i_ < count_ - 1) | |
183 | { | |
184 | os_ << "," << std::endl << " "; | |
185 | } | |
186 | } | |
187 | ||
188 | const std::size_t mod_ = sm_._dfa[state_]->size () % 8; | |
189 | ||
190 | if (mod_) | |
191 | { | |
192 | const std::size_t index_ = count_ * 8; | |
193 | ||
194 | if (count_) | |
195 | { | |
196 | os_ << ",\n "; | |
197 | } | |
198 | ||
199 | os_ << dfa_[index_]; | |
200 | ||
201 | for (j_ = 1; j_ < mod_; ++j_) | |
202 | { | |
203 | os_ << ", " << dfa_[index_ + j_]; | |
204 | } | |
205 | } | |
206 | ||
207 | os_ << "};\n"; | |
208 | } | |
209 | ||
210 | std::size_t count_ = sm_._dfa_alphabet.size (); | |
211 | std::size_t i_ = 1; | |
212 | ||
213 | os_ << " static const std::size_t *lookup_arr_[" << count_ << | |
214 | "] = {"; | |
215 | os_ << "lookup0_"; | |
216 | ||
217 | for (i_ = 1; i_ < count_; ++i_) | |
218 | { | |
219 | os_ << ", " << "lookup" << i_ << "_"; | |
220 | } | |
221 | ||
222 | os_ << "};\n"; | |
223 | os_ << " static const std::size_t dfa_alphabet_arr_[" << count_ << | |
224 | "] = {"; | |
225 | os_ << sm_._dfa_alphabet.front (); | |
226 | ||
227 | for (i_ = 1; i_ < count_; ++i_) | |
228 | { | |
229 | os_ << ", " << sm_._dfa_alphabet[i_]; | |
230 | } | |
231 | ||
232 | os_ << "};\n"; | |
233 | os_ << " static const std::size_t *dfa_arr_[" << count_ << | |
234 | "] = {"; | |
235 | os_ << "dfa0_"; | |
236 | ||
237 | for (i_ = 1; i_ < count_; ++i_) | |
238 | { | |
239 | os_ << ", " << "dfa" << i_ << "_"; | |
240 | } | |
241 | ||
242 | os_ << "};\n"; | |
243 | } | |
244 | else | |
245 | { | |
246 | const std::size_t *lookup_ = &sm_._lookup->front ()->front (); | |
247 | const std::size_t *dfa_ = &sm_._dfa->front ()->front (); | |
248 | std::size_t i_ = 0; | |
249 | std::size_t j_ = 1; | |
250 | std::size_t count_ = lookups_ / 8; | |
251 | ||
252 | os_ << " static const std::size_t lookup_["; | |
253 | os_ << sm_._lookup->front ()->size () << "] = {"; | |
254 | ||
255 | for (; i_ < count_; ++i_) | |
256 | { | |
257 | const std::size_t index_ = i_ * 8; | |
258 | ||
259 | os_ << lookup_[index_]; | |
260 | ||
261 | for (; j_ < 8; ++j_) | |
262 | { | |
263 | os_ << ", " << lookup_[index_ + j_]; | |
264 | } | |
265 | ||
266 | if (i_ < count_ - 1) | |
267 | { | |
268 | os_ << "," << std::endl << " "; | |
269 | } | |
270 | ||
271 | j_ = 1; | |
272 | } | |
273 | ||
274 | os_ << "};\n"; | |
275 | os_ << " static const std::size_t dfa_alphabet_ = " << | |
276 | sm_._dfa_alphabet.front () << ";\n"; | |
277 | os_ << " static const std::size_t dfa_[" << | |
278 | sm_._dfa->front ()->size () << "] = {"; | |
279 | count_ = sm_._dfa->front ()->size () / 8; | |
280 | ||
281 | for (i_ = 0; i_ < count_; ++i_) | |
282 | { | |
283 | const std::size_t index_ = i_ * 8; | |
284 | ||
285 | os_ << dfa_[index_]; | |
286 | ||
287 | for (j_ = 1; j_ < 8; ++j_) | |
288 | { | |
289 | os_ << ", " << dfa_[index_ + j_]; | |
290 | } | |
291 | ||
292 | if (i_ < count_ - 1) | |
293 | { | |
294 | os_ << "," << std::endl << " "; | |
295 | } | |
296 | } | |
297 | ||
298 | const std::size_t mod_ = sm_._dfa->front ()->size () % 8; | |
299 | ||
300 | if (mod_) | |
301 | { | |
302 | const std::size_t index_ = count_ * 8; | |
303 | ||
304 | if (count_) | |
305 | { | |
306 | os_ << ",\n "; | |
307 | } | |
308 | ||
309 | os_ << dfa_[index_]; | |
310 | ||
311 | for (j_ = 1; j_ < mod_; ++j_) | |
312 | { | |
313 | os_ << ", " << dfa_[index_ + j_]; | |
314 | } | |
315 | } | |
316 | ||
317 | os_ << "};\n"; | |
318 | } | |
319 | ||
320 | os_ << "\n if (start_token_ == end_)\n"; | |
321 | os_ << " {\n"; | |
322 | os_ << " unique_id_ = npos;\n"; | |
323 | os_ << " return 0;\n"; | |
324 | os_ << " }\n\n"; | |
325 | ||
326 | if (dfas_ > 1) | |
327 | { | |
328 | os_ << "again:\n"; | |
329 | os_ << " const std::size_t * lookup_ = " | |
330 | "lookup_arr_[start_state_];\n"; | |
331 | os_ << " std::size_t dfa_alphabet_ = " | |
332 | "dfa_alphabet_arr_[start_state_];\n"; | |
333 | os_ << " const std::size_t *dfa_ = dfa_arr_[start_state_];\n"; | |
334 | } | |
335 | ||
336 | os_ << " const std::size_t *ptr_ = dfa_ + dfa_alphabet_;\n"; | |
337 | os_ << " Iterator curr_ = start_token_;\n"; | |
338 | os_ << " bool end_state_ = *ptr_ != 0;\n"; | |
339 | os_ << " std::size_t id_ = *(ptr_ + id_index);\n"; | |
340 | os_ << " std::size_t uid_ = *(ptr_ + unique_id_index);\n"; | |
341 | ||
342 | if (dfas_ > 1) | |
343 | { | |
344 | os_ << " std::size_t end_start_state_ = start_state_;\n"; | |
345 | } | |
346 | ||
347 | if (sm_._seen_BOL_assertion) | |
348 | { | |
349 | os_ << " bool bol_ = beg_of_line_;\n"; | |
350 | os_ << " bool end_bol_ = bol_;\n"; | |
351 | } | |
352 | ||
353 | os_ << " Iterator end_token_ = start_token_;\n"; | |
354 | os_ << '\n'; | |
355 | os_ << " while (curr_ != end_)\n"; | |
356 | os_ << " {\n"; | |
357 | ||
358 | if (sm_._seen_BOL_assertion) | |
359 | { | |
360 | os_ << " const std::size_t BOL_state_ = ptr_[bol_index];\n"; | |
361 | } | |
362 | ||
363 | if (sm_._seen_EOL_assertion) | |
364 | { | |
365 | os_ << " const std::size_t EOL_state_ = ptr_[eol_index];\n"; | |
366 | } | |
367 | ||
368 | if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion) | |
369 | { | |
370 | os_ << '\n'; | |
371 | } | |
372 | ||
373 | if (sm_._seen_BOL_assertion) | |
374 | { | |
375 | os_ << " if (BOL_state_ && bol_)\n"; | |
376 | os_ << " {\n"; | |
377 | os_ << " ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];\n"; | |
378 | os_ << " }\n"; | |
379 | } | |
380 | ||
381 | if (sm_._seen_EOL_assertion) | |
382 | { | |
383 | os_ << " "; | |
384 | ||
385 | if (sm_._seen_BOL_assertion) | |
386 | { | |
387 | os_ << "else "; | |
388 | } | |
389 | ||
390 | os_ << "if (EOL_state_ && *curr_ == '\\n')\n"; | |
391 | os_ << " {\n"; | |
392 | os_ << " ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n"; | |
393 | os_ << " }\n"; | |
394 | } | |
395 | ||
396 | std::string tab_ (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion ? " " : ""); | |
397 | ||
398 | if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion) | |
399 | { | |
400 | os_ << " else\n"; | |
401 | os_ << " {\n"; | |
402 | } | |
403 | ||
404 | if (sm_._seen_BOL_assertion) | |
405 | { | |
406 | os_ << " "; | |
407 | ||
408 | if (lookups_ == 256) | |
409 | { | |
410 | os_ << "char"; | |
411 | } | |
412 | else | |
413 | { | |
414 | os_ << "wchar_t"; | |
415 | } | |
416 | ||
417 | os_ << " prev_char_ = *curr_++;\n\n"; | |
418 | os_ << " bol_ = prev_char_ == '\\n';\n\n"; | |
419 | } | |
420 | ||
421 | os_ << tab_; | |
422 | os_ << " const std::size_t state_ =\n"; | |
423 | os_ << tab_; | |
424 | os_ << " ptr_[lookup_["; | |
425 | ||
426 | if (lookups_ == 256) | |
427 | { | |
428 | os_ << "static_cast<unsigned char>("; | |
429 | } | |
430 | ||
431 | if (sm_._seen_BOL_assertion) | |
432 | { | |
433 | os_ << "prev_char"; | |
434 | } | |
435 | else | |
436 | { | |
437 | os_ << "*curr_++"; | |
438 | } | |
439 | ||
440 | ||
441 | if (lookups_ == 256) | |
442 | { | |
443 | os_ << ')'; | |
444 | } | |
445 | ||
446 | os_ << "]];\n\n"; | |
447 | ||
448 | os_ << tab_; | |
449 | os_ << " if (state_ == 0) break;\n\n"; | |
450 | os_ << tab_; | |
451 | os_ << " ptr_ = &dfa_[state_ * dfa_alphabet_];\n"; | |
452 | ||
453 | if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion) | |
454 | { | |
455 | os_ << " }\n"; | |
456 | } | |
457 | ||
458 | os_ << '\n'; | |
459 | os_ << " if (*ptr_)\n"; | |
460 | os_ << " {\n"; | |
461 | os_ << " end_state_ = true;\n"; | |
462 | os_ << " id_ = *(ptr_ + id_index);\n"; | |
463 | os_ << " uid_ = *(ptr_ + unique_id_index);\n"; | |
464 | ||
465 | if (dfas_ > 1) | |
466 | { | |
467 | os_ << " end_start_state_ = *(ptr_ + state_index);\n"; | |
468 | } | |
469 | ||
470 | if (sm_._seen_BOL_assertion) | |
471 | { | |
472 | os_ << " end_bol_ = bol_;\n"; | |
473 | } | |
474 | ||
475 | os_ << " end_token_ = curr_;\n"; | |
476 | os_ << " }\n"; | |
477 | os_ << " }\n"; | |
478 | os_ << '\n'; | |
479 | ||
480 | if (sm_._seen_EOL_assertion) | |
481 | { | |
482 | os_ << " const std::size_t EOL_state_ = ptr_[eol_index];\n"; | |
483 | os_ << '\n'; | |
484 | os_ << " if (EOL_state_ && curr_ == end_)\n"; | |
485 | os_ << " {\n"; | |
486 | os_ << " ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n"; | |
487 | os_ << '\n'; | |
488 | os_ << " if (*ptr_)\n"; | |
489 | os_ << " {\n"; | |
490 | os_ << " end_state_ = true;\n"; | |
491 | os_ << " id_ = *(ptr_ + id_index);\n"; | |
492 | os_ << " uid_ = *(ptr_ + unique_id_index);\n"; | |
493 | ||
494 | if (dfas_ > 1) | |
495 | { | |
496 | os_ << " end_start_state_ = *(ptr_ + state_index);\n"; | |
497 | } | |
498 | ||
499 | if (sm_._seen_BOL_assertion) | |
500 | { | |
501 | os_ << " end_bol_ = bol_;\n"; | |
502 | } | |
503 | ||
504 | os_ << " end_token_ = curr_;\n"; | |
505 | os_ << " }\n"; | |
506 | os_ << " }\n"; | |
507 | os_ << '\n'; | |
508 | } | |
509 | ||
510 | os_ << " if (end_state_)\n"; | |
511 | os_ << " {\n"; | |
512 | os_ << " // return longest match\n"; | |
513 | ||
514 | if (dfas_ > 1) | |
515 | { | |
516 | os_ << " start_state_ = end_start_state_;\n"; | |
517 | } | |
518 | ||
519 | if (sm_._seen_BOL_assertion && dfas_ < 2) | |
520 | { | |
521 | os_ << " beg_of_line_ = end_bol_;\n"; | |
522 | } | |
523 | ||
524 | os_ << " start_token_ = end_token_;\n"; | |
525 | ||
526 | if (dfas_ > 1) | |
527 | { | |
528 | os_ << '\n'; | |
529 | os_ << " if (id_ == 0)\n"; | |
530 | os_ << " {\n"; | |
531 | ||
532 | if (sm_._seen_BOL_assertion) | |
533 | { | |
534 | os_ << " bol_ = end_bol_;\n"; | |
535 | } | |
536 | ||
537 | os_ << " goto again;\n"; | |
538 | os_ << " }\n"; | |
539 | ||
540 | if (sm_._seen_BOL_assertion) | |
541 | { | |
542 | os_ << " else\n"; | |
543 | os_ << " {\n"; | |
544 | os_ << " beg_of_line_ = end_bol_;\n"; | |
545 | os_ << " }\n"; | |
546 | } | |
547 | } | |
548 | ||
549 | os_ << " }\n"; | |
550 | os_ << " else\n"; | |
551 | os_ << " {\n"; | |
552 | ||
553 | if (sm_._seen_BOL_assertion) | |
554 | { | |
555 | os_ << " beg_of_line_ = *start_token_ == '\\n';\n"; | |
556 | } | |
557 | ||
558 | if (skip_unknown_) | |
559 | { | |
560 | os_ << " // No match causes char to be skipped\n"; | |
561 | os_ << " ++start_token_;\n"; | |
562 | } | |
563 | ||
564 | os_ << " id_ = npos;\n"; | |
565 | os_ << " uid_ = npos;\n"; | |
566 | os_ << " }\n"; | |
567 | os_ << '\n'; | |
568 | os_ << " unique_id_ = uid_;\n"; | |
569 | os_ << " return id_;\n"; | |
570 | os_ << "}\n"; | |
571 | os_ << "\n#endif\n"; | |
572 | } | |
573 | } | |
574 | } | |
575 | ||
576 | #endif |