1 /*=============================================================================
2 Copyright (c) 2001-2011 Joel de Guzman
4 Distributed under the Boost Software License, Version 1.0. (See accompanying
5 file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 =============================================================================*/
7 #include <boost/config/warning_disable.hpp>
8 #include <boost/spirit/include/qi.hpp>
9 #include <boost/spirit/include/phoenix.hpp>
10 #include <boost/unordered_map.hpp>
11 #include <boost/algorithm/string/trim.hpp>
12 #include <boost/cstdint.hpp>
13 #include <boost/foreach.hpp>
14 #include <boost/array.hpp>
15 #include <boost/scoped_array.hpp>
16 #include <boost/range/iterator_range.hpp>
// Containers for the parsed data: a ucd_line holds the fields of one
// input line, a ucd_vector holds every line of one file.
typedef std::vector<std::string> ucd_line;
typedef std::vector<ucd_line> ucd_vector;
typedef ucd_vector::iterator ucd_iterator;
31 // spirit and phoenix using declarations
32 using boost::spirit::qi::parse
;
33 using boost::spirit::qi::hex
;
34 using boost::spirit::qi::char_
;
35 using boost::spirit::qi::eol
;
36 using boost::spirit::qi::rule
;
37 using boost::spirit::qi::omit
;
38 using boost::spirit::qi::_1
;
39 using boost::spirit::qi::_val
;
40 using boost::phoenix::push_back
;
41 using boost::phoenix::ref
;
43 // basic unsigned types
45 using boost::uint16_t;
46 using boost::uint32_t;
51 ucd_range(uint32_t start
, uint32_t finish
)
52 : start(start
), finish(finish
) {}
54 // we need this so we can use ucd_range as a multimap key
55 friend bool operator<(ucd_range
const& a
, ucd_range
const& b
)
57 return a
.start
< b
.start
;
68 ucd_info(char const* filename
)
70 std::ifstream
in(filename
, std::ios_base::in
);
73 std::cerr
<< "Error: Could not open input file: "
74 << filename
<< std::endl
;
78 std::string data
; // We will read the contents here.
79 in
.unsetf(std::ios::skipws
); // No white space skipping!
81 std::istream_iterator
<char>(in
),
82 std::istream_iterator
<char>(),
83 std::back_inserter(data
));
85 typedef std::string::const_iterator iterator_type
;
86 iterator_type f
= data
.begin();
87 iterator_type l
= data
.end();
89 rule
<iterator_type
> endl
= -('#' >> *(char_
-eol
)) >> eol
;
90 rule
<iterator_type
, std::string()> field
= *(char_
-(';'|endl
)) >> (';'|&endl
);
91 rule
<iterator_type
, ucd_line()> line
= +(field
-endl
) >> endl
;
92 rule
<iterator_type
, std::vector
<ucd_line
>()> file
= +(endl
| line
[push_back(_val
, _1
)]);
94 parse(f
, l
, file
, info
);
98 template <typename Array
>
99 void collect(Array
& data
, int field
, bool collect_properties
= true) const
101 BOOST_ASSERT(!info
.empty());
102 ucd_vector::const_iterator current
= info
.begin();
103 ucd_vector::const_iterator end
= info
.end();
105 while (current
!= end
)
107 std::string range
= (*current
)[0];
110 std::string::const_iterator f
= range
.begin();
111 std::string::const_iterator l
= range
.end();
113 // get the code-point range
116 parse(f
, l
, hex
[ref(start
) = ref(finish
) = _1
] >> -(".." >> hex
[ref(finish
) = _1
]));
118 // special case for UnicodeData.txt ranges:
119 if ((*current
)[1].find("First>") != std::string::npos
)
122 BOOST_ASSERT(current
!= end
);
123 BOOST_ASSERT((*current
)[1].find("Last>") != std::string::npos
);
125 std::string range
= (*current
)[0];
130 parse(f
, l
, hex
[ref(finish
) = _1
]);
134 if (field
< int(current
->size()))
135 code
= (*current
)[field
];
137 // Only collect properties we are interested in
138 if (collect_properties
) // code for properties
140 if (!ignore_property(code
))
142 for (uint32_t i
= start
; i
<= finish
; ++i
)
143 data
[i
] |= map_property(code
);
146 else // code for actual numeric values
148 for (uint32_t i
= start
; i
<= finish
; ++i
)
152 data
[i
] = 0; // signal that this code maps to itself
158 parse(f
, l
, hex
, data
[i
]);
168 static bool ignore_property(std::string
const& p
)
170 // We don't handle all properties
171 std::map
<std::string
, int>& pm
= get_property_map();
172 std::map
<std::string
, int>::iterator i
= pm
.find(p
);
173 return i
== pm
.end();
177 map_property(std::string
const& p
)
179 std::map
<std::string
, int>& pm
= get_property_map();
180 std::map
<std::string
, int>::iterator i
= pm
.find(p
);
181 BOOST_ASSERT(i
!= pm
.end());
185 static std::map
<std::string
, int>&
188 // The properties we are interested in:
189 static std::map
<std::string
, int> map
;
230 // Derived Properties.
231 map
["Alphabetic"] = 64;
232 map
["Uppercase"] = 128;
233 map
["Lowercase"] = 256;
234 map
["White_Space"] = 512;
235 map
["Hex_Digit"] = 1024;
236 map
["Noncharacter_Code_Point"] = 2048;
237 map
["Default_Ignorable_Code_Point"] = 4096;
241 map
["Imperial_Aramaic"] = 1;
251 map
["Canadian_Aboriginal"] = 11;
254 map
["Cherokee"] = 14;
257 map
["Cyrillic"] = 17;
258 map
["Devanagari"] = 18;
260 map
["Egyptian_Hieroglyphs"] = 20;
261 map
["Ethiopic"] = 21;
262 map
["Georgian"] = 22;
263 map
["Glagolitic"] = 23;
266 map
["Gujarati"] = 26;
267 map
["Gurmukhi"] = 27;
272 map
["Hiragana"] = 32;
273 map
["Katakana_Or_Hiragana"] = 33;
274 map
["Old_Italic"] = 34;
275 map
["Javanese"] = 35;
276 map
["Kayah_Li"] = 36;
277 map
["Katakana"] = 37;
278 map
["Kharoshthi"] = 38;
282 map
["Tai_Tham"] = 42;
287 map
["Linear_B"] = 47;
291 map
["Malayalam"] = 51;
292 map
["Mongolian"] = 52;
293 map
["Meetei_Mayek"] = 53;
297 map
["Ol_Chiki"] = 57;
298 map
["Old_Turkic"] = 58;
301 map
["Phags_Pa"] = 61;
302 map
["Inscriptional_Pahlavi"] = 62;
303 map
["Phoenician"] = 63;
304 map
["Inscriptional_Parthian"] = 64;
307 map
["Samaritan"] = 67;
308 map
["Old_South_Arabian"] = 68;
309 map
["Saurashtra"] = 69;
312 map
["Sundanese"] = 72;
313 map
["Syloti_Nagri"] = 73;
315 map
["Tagbanwa"] = 75;
317 map
["New_Tai_Lue"] = 77;
319 map
["Tai_Viet"] = 79;
321 map
["Tifinagh"] = 81;
326 map
["Ugaritic"] = 86;
328 map
["Old_Persian"] = 88;
329 map
["Cuneiform"] = 89;
331 map
["Inherited"] = 91;
341 template <typename T
, uint32_t block_size_
= 256>
342 class ucd_table_builder
346 static uint32_t const block_size
= block_size_
;
347 static uint32_t const full_span
= 0x110000;
348 typedef T value_type
;
350 ucd_table_builder() : p(new T
[full_span
])
352 for (uint32_t i
= 0; i
< full_span
; ++i
)
356 void collect(char const* filename
, int field
, bool collect_properties
= true)
358 std::cout
<< "collecting " << filename
<< std::endl
;
359 ucd_info
info(filename
);
360 info
.collect(p
, field
, collect_properties
);
363 void build(std::vector
<uint8_t>& stage1
, std::vector
<T
const*>& stage2
)
365 std::cout
<< "building tables" << std::endl
;
366 std::map
<block_ptr
, std::vector
<T
const*> > blocks
;
367 for (T
const* i
= p
.get(); i
< (p
.get() + full_span
); i
+= block_size
)
368 blocks
[block_ptr(i
)].push_back(i
);
370 // Not enough bits to store the block indices.
371 BOOST_ASSERT(blocks
.size() < (1 << (sizeof(uint8_t) * 8)));
373 typedef std::pair
<block_ptr
, std::vector
<T
const*> > blocks_value_type
;
374 std::map
<T
const*, std::vector
<T
const*> > sorted_blocks
;
375 BOOST_FOREACH(blocks_value_type
const& val
, blocks
)
377 sorted_blocks
[val
.first
.p
] = val
.second
;
381 stage1
.reserve(full_span
/ block_size
);
382 stage1
.resize(full_span
/ block_size
);
384 stage2
.reserve(blocks
.size());
386 typedef std::pair
<T
const*, std::vector
<T
const*> > sorted_blocks_value_type
;
387 BOOST_FOREACH(sorted_blocks_value_type
const& val
, sorted_blocks
)
389 stage2
.push_back(val
.first
);
390 BOOST_FOREACH(T
const* val2
, val
.second
)
392 stage1
[(val2
- p
.get()) / block_size
] = stage2
.size() - 1;
401 block_ptr(T
const* p
) : p(p
) {}
403 friend bool operator<(block_ptr a
, block_ptr b
)
405 return std::lexicographical_compare(
406 a
.p
, a
.p
+ block_size
, b
.p
, b
.p
+ block_size
);
412 boost::scoped_array
<T
> p
;
415 template <typename Out
>
416 void print_tab(Out
& out
, int tab
)
418 for (int i
= 0; i
< tab
; ++i
)
// Writes the elements of `c` to `out` as a comma-separated list.
//
//   c              - indexable container with size(); must hold more than
//                    one element (asserted). Each element is converted to
//                    int before being printed.
//   trailing_comma - when true, a final ", " and a line break follow the
//                    last element.
//   width          - field width applied to every element (std::setw).
//   group          - number of elements per output row; each row is
//                    indented with print_tab.
template <typename Out, typename C>
void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
{
    int const tab = 4;
    typename C::size_type size = c.size();
    BOOST_ASSERT(size > 1);
    print_tab(out, tab);
    out << std::setw(width) << int(c[0]);
    // `typename` is required here: C::size_type is a dependent type.
    for (typename C::size_type i = 1; i < size; ++i)
    {
        out << ", ";
        if ((i % group) == 0)
        {
            // start a new, indented row
            out << std::endl;
            print_tab(out, tab);
        }
        out << std::setw(width) << int(c[i]);
    }

    if (trailing_comma)
        out << ", " << std::endl;
}
445 template <typename Out
>
446 void print_head(Out
& out
)
449 << "/*=============================================================================\n"
450 << " Copyright (c) 2001-2011 Joel de Guzman\n"
452 << " Distributed under the Boost Software License, Version 1.0. (See accompanying\n"
453 << " file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n"
455 << " AUTOGENERATED. DO NOT EDIT!!!\n"
456 << "==============================================================================*/\n"
457 << "#include <boost/cstdint.hpp>\n"
459 << "namespace boost { namespace spirit { namespace ucd { namespace detail\n"
464 template <typename Out
>
465 void print_tail(Out
& out
)
469 << "}}}} // namespace boost::spirit::unicode::detail\n"
// Maps the size in bytes of an unsigned integer type to the name of the
// matching fixed-width Boost type, as it is written into the generated
// tables. Any other size trips the assertion and yields a null pointer.
char const* get_int_type_name(int size)
{
    switch (size)
    {
        case 1: return "::boost::uint8_t";
        case 2: return "::boost::uint16_t";
        case 4: return "::boost::uint32_t";
        case 8: return "::boost::uint64_t"; // 8 bytes; a size of 5 never occurs
        default: BOOST_ASSERT(false); return 0; // invalid size
    }
}
485 template <typename Out
, typename Builder
>
486 void print_file(Out
& out
, Builder
& builder
, int field_width
, char const* name
)
488 std::cout
<< "Generating " << name
<< " tables" << std::endl
;
490 uint32_t const block_size
= Builder::block_size
;
491 typedef typename
Builder::value_type value_type
;
494 std::vector
<uint8_t> stage1
;
495 std::vector
<value_type
const*> stage2
;
496 builder
.build(stage1
, stage2
);
497 std::cout
<< "Block Size: " << block_size
<< std::endl
;
498 std::cout
<< "Total Bytes: "
499 << stage1
.size()+(stage2
.size()*block_size
*sizeof(value_type
))
504 << " static const ::boost::uint8_t " << name
<< "_stage1[] = {\n"
508 print_table(out
, stage1
, false, 3);
509 char const* int_name
= get_int_type_name(sizeof(value_type
));
516 << " static const " << int_name
<< ' ' << name
<< "_stage2[] = {"
520 for (int i
= 0; i
< int(stage2
.size()); ++i
)
522 value_type
const* p
= stage2
[i
];
523 bool last
= (i
+1 == stage2
.size());
524 out
<< "\n\n // block " << block_n
++ << std::endl
;
526 boost::iterator_range
<value_type
const*>(p
, p
+block_size
), !last
, field_width
);
537 << " inline " << int_name
<< ' ' << name
<< "_lookup(::boost::uint32_t ch)\n"
539 << " ::boost::uint32_t block_offset = " << name
<< "_stage1[ch / " << block_size
<< "] * " << block_size
<< ";\n"
540 << " return " << name
<< "_stage2[block_offset + ch % " << block_size
<< "];\n"
549 // The category tables
551 std::ofstream
out("category_table.hpp");
552 ucd_table_builder
<uint16_t, 256> builder
;
553 builder
.collect("UnicodeData.txt", 2);
554 builder
.collect("DerivedCoreProperties.txt", 1);
555 builder
.collect("PropList.txt", 1);
556 print_file(out
, builder
, 4, "category");
561 std::ofstream
out("script_table.hpp");
562 ucd_table_builder
<uint8_t, 256> builder
;
563 builder
.collect("Scripts.txt", 1);
564 print_file(out
, builder
, 3, "script");
567 // The lowercase tables
569 std::ofstream
out("lowercase_table.hpp");
570 ucd_table_builder
<uint32_t, 256> builder
;
571 builder
.collect("UnicodeData.txt", 13, false);
572 print_file(out
, builder
, 6, "lowercase");
575 // The uppercase tables
577 std::ofstream
out("uppercase_table.hpp");
578 ucd_table_builder
<uint32_t, 256> builder
;
579 builder
.collect("UnicodeData.txt", 12, false);
580 print_file(out
, builder
, 6, "uppercase");