]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/spirit/include/boost/spirit/home/support/char_encoding/unicode/create_tables.cpp
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / boost / libs / spirit / include / boost / spirit / home / support / char_encoding / unicode / create_tables.cpp
1 /*=============================================================================
2 Copyright (c) 2001-2011 Joel de Guzman
3
4 Distributed under the Boost Software License, Version 1.0. (See accompanying
5 file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 =============================================================================*/
7 #include <boost/config/warning_disable.hpp>
8 #include <boost/spirit/include/qi.hpp>
9 #include <boost/spirit/include/phoenix.hpp>
10 #include <boost/unordered_map.hpp>
11 #include <boost/algorithm/string/trim.hpp>
12 #include <boost/cstdint.hpp>
13 #include <boost/foreach.hpp>
14 #include <boost/array.hpp>
15 #include <boost/scoped_array.hpp>
16 #include <boost/range/iterator_range.hpp>
17
18 #include <iostream>
19 #include <iomanip>
20 #include <fstream>
21 #include <vector>
22 #include <algorithm>
23 #include <string>
24 #include <map>
25
26 // We place the data here. Each line comprises various fields
27 typedef std::vector<std::string> ucd_line;
28 typedef std::vector<ucd_line> ucd_vector;
29 typedef std::vector<ucd_line>::iterator ucd_iterator;
30
31 // spirit and phoenix using declarations
32 using boost::spirit::qi::parse;
33 using boost::spirit::qi::hex;
34 using boost::spirit::qi::char_;
35 using boost::spirit::qi::eol;
36 using boost::spirit::qi::rule;
37 using boost::spirit::qi::omit;
38 using boost::spirit::qi::_1;
39 using boost::spirit::qi::_val;
40 using boost::phoenix::push_back;
41 using boost::phoenix::ref;
42
43 // basic unsigned types
44 using boost::uint8_t;
45 using boost::uint16_t;
46 using boost::uint32_t;
47
48 // a char range
49 struct ucd_range
50 {
51 ucd_range(uint32_t start, uint32_t finish)
52 : start(start), finish(finish) {}
53
54 // we need this so we can use ucd_range as a multimap key
55 friend bool operator<(ucd_range const& a, ucd_range const& b)
56 {
57 return a.start < b.start;
58 }
59
60 uint32_t start;
61 uint32_t finish;
62 };
63
64 class ucd_info
65 {
66 public:
67
68 ucd_info(char const* filename)
69 {
70 std::ifstream in(filename, std::ios_base::in);
71 if (!in)
72 {
73 std::cerr << "Error: Could not open input file: "
74 << filename << std::endl;
75 }
76 else
77 {
78 std::string data; // We will read the contents here.
79 in.unsetf(std::ios::skipws); // No white space skipping!
80 std::copy(
81 std::istream_iterator<char>(in),
82 std::istream_iterator<char>(),
83 std::back_inserter(data));
84
85 typedef std::string::const_iterator iterator_type;
86 iterator_type f = data.begin();
87 iterator_type l = data.end();
88
89 rule<iterator_type> endl = -('#' >> *(char_-eol)) >> eol;
90 rule<iterator_type, std::string()> field = *(char_-(';'|endl)) >> (';'|&endl);
91 rule<iterator_type, ucd_line()> line = +(field-endl) >> endl;
92 rule<iterator_type, std::vector<ucd_line>()> file = +(endl | line[push_back(_val, _1)]);
93
94 parse(f, l, file, info);
95 }
96 }
97
98 template <typename Array>
99 void collect(Array& data, int field, bool collect_properties = true) const
100 {
101 BOOST_ASSERT(!info.empty());
102 ucd_vector::const_iterator current = info.begin();
103 ucd_vector::const_iterator end = info.end();
104
105 while (current != end)
106 {
107 std::string range = (*current)[0];
108 boost::trim(range);
109
110 std::string::const_iterator f = range.begin();
111 std::string::const_iterator l = range.end();
112
113 // get the code-point range
114 uint32_t start;
115 uint32_t finish;
116 parse(f, l, hex[ref(start) = ref(finish) = _1] >> -(".." >> hex[ref(finish) = _1]));
117
118 // special case for UnicodeData.txt ranges:
119 if ((*current)[1].find("First>") != std::string::npos)
120 {
121 ++current;
122 BOOST_ASSERT(current != end);
123 BOOST_ASSERT((*current)[1].find("Last>") != std::string::npos);
124
125 std::string range = (*current)[0];
126 boost::trim(range);
127 f = range.begin();
128 l = range.end();
129
130 parse(f, l, hex[ref(finish) = _1]);
131 }
132
133 std::string code;
134 if (field < int(current->size()))
135 code = (*current)[field];
136 boost::trim(code);
137 // Only collect properties we are interested in
138 if (collect_properties) // code for properties
139 {
140 if (!ignore_property(code))
141 {
142 for (uint32_t i = start; i <= finish; ++i)
143 data[i] |= map_property(code);
144 }
145 }
146 else // code for actual numeric values
147 {
148 for (uint32_t i = start; i <= finish; ++i)
149 {
150 if (code.empty())
151 {
152 data[i] = 0; // signal that this code maps to itself
153 }
154 else
155 {
156 f = code.begin();
157 l = code.end();
158 parse(f, l, hex, data[i]);
159 }
160 }
161 }
162 ++current;
163 }
164 }
165
166 private:
167
168 static bool ignore_property(std::string const& p)
169 {
170 // We don't handle all properties
171 std::map<std::string, int>& pm = get_property_map();
172 std::map<std::string, int>::iterator i = pm.find(p);
173 return i == pm.end();
174 }
175
176 static int
177 map_property(std::string const& p)
178 {
179 std::map<std::string, int>& pm = get_property_map();
180 std::map<std::string, int>::iterator i = pm.find(p);
181 BOOST_ASSERT(i != pm.end());
182 return i->second;
183 }
184
185 static std::map<std::string, int>&
186 get_property_map()
187 {
188 // The properties we are interested in:
189 static std::map<std::string, int> map;
190 if (map.empty())
191 {
192 // General_Category
193 map["Lu"] = 0;
194 map["Ll"] = 1;
195 map["Lt"] = 2;
196 map["Lm"] = 3;
197 map["Lo"] = 4;
198
199 map["Mn"] = 8;
200 map["Me"] = 9;
201 map["Mc"] = 10;
202
203 map["Nd"] = 16;
204 map["Nl"] = 17;
205 map["No"] = 18;
206
207 map["Zs"] = 24;
208 map["Zl"] = 25;
209 map["Zp"] = 26;
210
211 map["Cc"] = 32;
212 map["Cf"] = 33;
213 map["Co"] = 34;
214 map["Cs"] = 35;
215 map["Cn"] = 36;
216
217 map["Pd"] = 40;
218 map["Ps"] = 41;
219 map["Pe"] = 42;
220 map["Pc"] = 43;
221 map["Po"] = 44;
222 map["Pi"] = 45;
223 map["Pf"] = 46;
224
225 map["Sm"] = 48;
226 map["Sc"] = 49;
227 map["Sk"] = 50;
228 map["So"] = 51;
229
230 // Derived Properties.
231 map["Alphabetic"] = 64;
232 map["Uppercase"] = 128;
233 map["Lowercase"] = 256;
234 map["White_Space"] = 512;
235 map["Hex_Digit"] = 1024;
236 map["Noncharacter_Code_Point"] = 2048;
237 map["Default_Ignorable_Code_Point"] = 4096;
238
239 // Script
240 map["Arabic"] = 0;
241 map["Imperial_Aramaic"] = 1;
242 map["Armenian"] = 2;
243 map["Avestan"] = 3;
244 map["Balinese"] = 4;
245 map["Bamum"] = 5;
246 map["Bengali"] = 6;
247 map["Bopomofo"] = 7;
248 map["Braille"] = 8;
249 map["Buginese"] = 9;
250 map["Buhid"] = 10;
251 map["Canadian_Aboriginal"] = 11;
252 map["Carian"] = 12;
253 map["Cham"] = 13;
254 map["Cherokee"] = 14;
255 map["Coptic"] = 15;
256 map["Cypriot"] = 16;
257 map["Cyrillic"] = 17;
258 map["Devanagari"] = 18;
259 map["Deseret"] = 19;
260 map["Egyptian_Hieroglyphs"] = 20;
261 map["Ethiopic"] = 21;
262 map["Georgian"] = 22;
263 map["Glagolitic"] = 23;
264 map["Gothic"] = 24;
265 map["Greek"] = 25;
266 map["Gujarati"] = 26;
267 map["Gurmukhi"] = 27;
268 map["Hangul"] = 28;
269 map["Han"] = 29;
270 map["Hanunoo"] = 30;
271 map["Hebrew"] = 31;
272 map["Hiragana"] = 32;
273 map["Katakana_Or_Hiragana"] = 33;
274 map["Old_Italic"] = 34;
275 map["Javanese"] = 35;
276 map["Kayah_Li"] = 36;
277 map["Katakana"] = 37;
278 map["Kharoshthi"] = 38;
279 map["Khmer"] = 39;
280 map["Kannada"] = 40;
281 map["Kaithi"] = 41;
282 map["Tai_Tham"] = 42;
283 map["Lao"] = 43;
284 map["Latin"] = 44;
285 map["Lepcha"] = 45;
286 map["Limbu"] = 46;
287 map["Linear_B"] = 47;
288 map["Lisu"] = 48;
289 map["Lycian"] = 49;
290 map["Lydian"] = 50;
291 map["Malayalam"] = 51;
292 map["Mongolian"] = 52;
293 map["Meetei_Mayek"] = 53;
294 map["Myanmar"] = 54;
295 map["Nko"] = 55;
296 map["Ogham"] = 56;
297 map["Ol_Chiki"] = 57;
298 map["Old_Turkic"] = 58;
299 map["Oriya"] = 59;
300 map["Osmanya"] = 60;
301 map["Phags_Pa"] = 61;
302 map["Inscriptional_Pahlavi"] = 62;
303 map["Phoenician"] = 63;
304 map["Inscriptional_Parthian"] = 64;
305 map["Rejang"] = 65;
306 map["Runic"] = 66;
307 map["Samaritan"] = 67;
308 map["Old_South_Arabian"] = 68;
309 map["Saurashtra"] = 69;
310 map["Shavian"] = 70;
311 map["Sinhala"] = 71;
312 map["Sundanese"] = 72;
313 map["Syloti_Nagri"] = 73;
314 map["Syriac"] = 74;
315 map["Tagbanwa"] = 75;
316 map["Tai_Le"] = 76;
317 map["New_Tai_Lue"] = 77;
318 map["Tamil"] = 78;
319 map["Tai_Viet"] = 79;
320 map["Telugu"] = 80;
321 map["Tifinagh"] = 81;
322 map["Tagalog"] = 82;
323 map["Thaana"] = 83;
324 map["Thai"] = 84;
325 map["Tibetan"] = 85;
326 map["Ugaritic"] = 86;
327 map["Vai"] = 87;
328 map["Old_Persian"] = 88;
329 map["Cuneiform"] = 89;
330 map["Yi"] = 90;
331 map["Inherited"] = 91;
332 map["Common"] = 92;
333 map["Unknown"] = 93;
334 }
335 return map;
336 }
337
338 ucd_vector info;
339 };
340
341 template <typename T, uint32_t block_size_ = 256>
342 class ucd_table_builder
343 {
344 public:
345
346 static uint32_t const block_size = block_size_;
347 static uint32_t const full_span = 0x110000;
348 typedef T value_type;
349
350 ucd_table_builder() : p(new T[full_span])
351 {
352 for (uint32_t i = 0; i < full_span; ++i)
353 p[i] = 0;
354 }
355
356 void collect(char const* filename, int field, bool collect_properties = true)
357 {
358 std::cout << "collecting " << filename << std::endl;
359 ucd_info info(filename);
360 info.collect(p, field, collect_properties);
361 }
362
363 void build(std::vector<uint8_t>& stage1, std::vector<T const*>& stage2)
364 {
365 std::cout << "building tables" << std::endl;
366 std::map<block_ptr, std::vector<T const*> > blocks;
367 for (T const* i = p.get(); i < (p.get() + full_span); i += block_size)
368 blocks[block_ptr(i)].push_back(i);
369
370 // Not enough bits to store the block indices.
371 BOOST_ASSERT(blocks.size() < (1 << (sizeof(uint8_t) * 8)));
372
373 typedef std::pair<block_ptr, std::vector<T const*> > blocks_value_type;
374 std::map<T const*, std::vector<T const*> > sorted_blocks;
375 BOOST_FOREACH(blocks_value_type const& val, blocks)
376 {
377 sorted_blocks[val.first.p] = val.second;
378 }
379
380 stage1.clear();
381 stage1.reserve(full_span / block_size);
382 stage1.resize(full_span / block_size);
383 stage2.clear();
384 stage2.reserve(blocks.size());
385
386 typedef std::pair<T const*, std::vector<T const*> > sorted_blocks_value_type;
387 BOOST_FOREACH(sorted_blocks_value_type const& val, sorted_blocks)
388 {
389 stage2.push_back(val.first);
390 BOOST_FOREACH(T const* val2, val.second)
391 {
392 stage1[(val2 - p.get()) / block_size] = stage2.size() - 1;
393 }
394 }
395 }
396
397 private:
398
399 struct block_ptr
400 {
401 block_ptr(T const* p) : p(p) {}
402
403 friend bool operator<(block_ptr a, block_ptr b)
404 {
405 return std::lexicographical_compare(
406 a.p, a.p + block_size, b.p, b.p + block_size);
407 }
408
409 T const* p;
410 };
411
412 boost::scoped_array<T> p;
413 };
414
415 template <typename Out>
416 void print_tab(Out& out, int tab)
417 {
418 for (int i = 0; i < tab; ++i)
419 out << ' ';
420 }
421
422 template <typename Out, typename C>
423 void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
424 {
425 int const tab = 4;
426 typename C::size_type size = c.size();
427 BOOST_ASSERT(size > 1);
428 print_tab(out, tab);
429 out << std::setw(width) << int(c[0]);
430 for (C::size_type i = 1; i < size; ++i)
431 {
432 out << ", ";
433 if ((i % group) == 0)
434 {
435 out << std::endl;
436 print_tab(out, tab);
437 }
438 out << std::setw(width) << int(c[i]);
439 }
440
441 if (trailing_comma)
442 out << ", " << std::endl;
443 }
444
445 template <typename Out>
446 void print_head(Out& out)
447 {
448 out
449 << "/*=============================================================================\n"
450 << " Copyright (c) 2001-2011 Joel de Guzman\n"
451 << "\n"
452 << " Distributed under the Boost Software License, Version 1.0. (See accompanying\n"
453 << " file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n"
454 << "\n"
455 << " AUTOGENERATED. DO NOT EDIT!!!\n"
456 << "==============================================================================*/\n"
457 << "#include <boost/cstdint.hpp>\n"
458 << "\n"
459 << "namespace boost { namespace spirit { namespace ucd { namespace detail\n"
460 << "{"
461 ;
462 }
463
464 template <typename Out>
465 void print_tail(Out& out)
466 {
467 out
468 << "\n"
469 << "}}}} // namespace boost::spirit::unicode::detail\n"
470 ;
471 }
472
473 char const* get_int_type_name(int size)
474 {
475 switch (size)
476 {
477 case 1: return "::boost::uint8_t";
478 case 2: return "::boost::uint16_t";
479 case 4: return "::boost::uint32_t";
480 case 5: return "::boost::uint64_t";
481 default: BOOST_ASSERT(false); return 0; // invalid size
482 };
483 }
484
485 template <typename Out, typename Builder>
486 void print_file(Out& out, Builder& builder, int field_width, char const* name)
487 {
488 std::cout << "Generating " << name << " tables" << std::endl;
489
490 uint32_t const block_size = Builder::block_size;
491 typedef typename Builder::value_type value_type;
492 print_head(out);
493
494 std::vector<uint8_t> stage1;
495 std::vector<value_type const*> stage2;
496 builder.build(stage1, stage2);
497 std::cout << "Block Size: " << block_size << std::endl;
498 std::cout << "Total Bytes: "
499 << stage1.size()+(stage2.size()*block_size*sizeof(value_type))
500 << std::endl;
501
502 out
503 << "\n"
504 << " static const ::boost::uint8_t " << name << "_stage1[] = {\n"
505 << "\n"
506 ;
507
508 print_table(out, stage1, false, 3);
509 char const* int_name = get_int_type_name(sizeof(value_type));
510
511 out
512 << "\n"
513 << " };"
514 << "\n"
515 << "\n"
516 << " static const " << int_name << ' ' << name << "_stage2[] = {"
517 ;
518
519 int block_n = 0;
520 for (int i = 0; i < int(stage2.size()); ++i)
521 {
522 value_type const* p = stage2[i];
523 bool last = (i+1 == stage2.size());
524 out << "\n\n // block " << block_n++ << std::endl;
525 print_table(out,
526 boost::iterator_range<value_type const*>(p, p+block_size), !last, field_width);
527 }
528
529 out
530 << "\n"
531 << " };"
532 << "\n"
533 ;
534
535 out
536 << "\n"
537 << " inline " << int_name << ' ' << name << "_lookup(::boost::uint32_t ch)\n"
538 << " {\n"
539 << " ::boost::uint32_t block_offset = " << name << "_stage1[ch / " << block_size << "] * " << block_size << ";\n"
540 << " return " << name << "_stage2[block_offset + ch % " << block_size << "];\n"
541 << " }\n"
542 ;
543
544 print_tail(out);
545 }
546
547 int main()
548 {
549 // The category tables
550 {
551 std::ofstream out("category_table.hpp");
552 ucd_table_builder<uint16_t, 256> builder;
553 builder.collect("UnicodeData.txt", 2);
554 builder.collect("DerivedCoreProperties.txt", 1);
555 builder.collect("PropList.txt", 1);
556 print_file(out, builder, 4, "category");
557 }
558
559 // The script tables
560 {
561 std::ofstream out("script_table.hpp");
562 ucd_table_builder<uint8_t, 256> builder;
563 builder.collect("Scripts.txt", 1);
564 print_file(out, builder, 3, "script");
565 }
566
567 // The lowercase tables
568 {
569 std::ofstream out("lowercase_table.hpp");
570 ucd_table_builder<uint32_t, 256> builder;
571 builder.collect("UnicodeData.txt", 13, false);
572 print_file(out, builder, 6, "lowercase");
573 }
574
575 // The uppercase tables
576 {
577 std::ofstream out("uppercase_table.hpp");
578 ucd_table_builder<uint32_t, 256> builder;
579 builder.collect("UnicodeData.txt", 12, false);
580 print_file(out, builder, 6, "uppercase");
581 }
582
583 return 0;
584 }