]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See | |
5 | // accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt) | |
7 | // | |
8 | #define BOOST_LOCALE_SOURCE | |
9 | #include <boost/locale/boundary.hpp> | |
10 | #include <boost/locale/generator.hpp> | |
11fdf7f2 | 11 | #include <boost/locale/hold_ptr.hpp> |
7c673cae FG |
12 | #include <unicode/uversion.h> |
13 | #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306 | |
14 | #include <unicode/utext.h> | |
15 | #endif | |
16 | #include <unicode/brkiter.h> | |
17 | #include <unicode/rbbi.h> | |
18 | ||
19 | #include "cdata.hpp" | |
20 | #include "all_generator.hpp" | |
21 | #include "icu_util.hpp" | |
22 | #include "uconv.hpp" | |
23 | ||
24 | namespace boost { | |
25 | namespace locale { | |
26 | namespace boundary { | |
27 | namespace impl_icu { | |
28 | ||
29 | using namespace boost::locale::impl_icu; | |
30 | ||
31 | index_type map_direct(boundary_type t,icu::BreakIterator *it,int reserve) | |
32 | { | |
33 | index_type indx; | |
34 | indx.reserve(reserve); | |
35 | #if U_ICU_VERSION_MAJOR_NUM >= 52 | |
36 | icu::BreakIterator *rbbi=it; | |
37 | #else | |
38 | icu::RuleBasedBreakIterator *rbbi=dynamic_cast<icu::RuleBasedBreakIterator *>(it); | |
39 | #endif | |
40 | ||
41 | indx.push_back(break_info()); | |
42 | it->first(); | |
43 | int pos=0; | |
44 | while((pos=it->next())!=icu::BreakIterator::DONE) { | |
45 | indx.push_back(break_info(pos)); | |
46 | /// Character does not have any specific break types | |
47 | if(t!=character && rbbi) { | |
48 | // | |
49 | // There is a collapse for MSVC: int32_t defined by both boost::cstdint and icu... | |
50 | // So need to pick one ;( | |
51 | // | |
52 | std::vector< ::int32_t> buffer; | |
53 | ::int32_t membuf[8]={0}; // try not to use memory allocation if possible | |
54 | ::int32_t *buf=membuf; | |
55 | ||
56 | UErrorCode err=U_ZERO_ERROR; | |
57 | int n = rbbi->getRuleStatusVec(buf,8,err); | |
58 | ||
59 | if(err == U_BUFFER_OVERFLOW_ERROR) { | |
60 | buf=&buffer.front(); | |
61 | buffer.resize(n,0); | |
62 | n = rbbi->getRuleStatusVec(buf,buffer.size(),err); | |
63 | } | |
64 | ||
65 | check_and_throw_icu_error(err); | |
66 | ||
67 | for(int i=0;i<n;i++) { | |
68 | switch(t) { | |
69 | case word: | |
70 | if(UBRK_WORD_NONE<=buf[i] && buf[i]<UBRK_WORD_NONE_LIMIT) | |
71 | indx.back().rule |= word_none; | |
72 | else if(UBRK_WORD_NUMBER<=buf[i] && buf[i]<UBRK_WORD_NUMBER_LIMIT) | |
73 | indx.back().rule |= word_number; | |
74 | else if(UBRK_WORD_LETTER<=buf[i] && buf[i]<UBRK_WORD_LETTER_LIMIT) | |
75 | indx.back().rule |= word_letter; | |
76 | else if(UBRK_WORD_KANA<=buf[i] && buf[i]<UBRK_WORD_KANA_LIMIT) | |
77 | indx.back().rule |= word_kana; | |
78 | else if(UBRK_WORD_IDEO<=buf[i] && buf[i]<UBRK_WORD_IDEO_LIMIT) | |
79 | indx.back().rule |= word_ideo; | |
80 | break; | |
81 | ||
82 | case line: | |
83 | if(UBRK_LINE_SOFT<=buf[i] && buf[i]<UBRK_LINE_SOFT_LIMIT) | |
84 | indx.back().rule |= line_soft; | |
85 | else if(UBRK_LINE_HARD<=buf[i] && buf[i]<UBRK_LINE_HARD_LIMIT) | |
86 | indx.back().rule |= line_hard; | |
87 | break; | |
88 | ||
89 | case sentence: | |
90 | if(UBRK_SENTENCE_TERM<=buf[i] && buf[i]<UBRK_SENTENCE_TERM_LIMIT) | |
91 | indx.back().rule |= sentence_term; | |
92 | else if(UBRK_SENTENCE_SEP<=buf[i] && buf[i]<UBRK_SENTENCE_SEP_LIMIT) | |
93 | indx.back().rule |= sentence_sep; | |
94 | break; | |
95 | default: | |
96 | ; | |
97 | } | |
98 | } | |
99 | } | |
100 | else { | |
101 | indx.back().rule |=character_any; // Baisc mark... for character | |
102 | } | |
103 | } | |
104 | return indx; | |
105 | } | |
106 | ||
11fdf7f2 | 107 | icu::BreakIterator *get_iterator(boundary_type t,icu::Locale const &loc) |
7c673cae FG |
108 | { |
109 | UErrorCode err=U_ZERO_ERROR; | |
11fdf7f2 | 110 | hold_ptr<icu::BreakIterator> bi; |
7c673cae FG |
111 | switch(t) { |
112 | case character: | |
113 | bi.reset(icu::BreakIterator::createCharacterInstance(loc,err)); | |
114 | break; | |
115 | case word: | |
116 | bi.reset(icu::BreakIterator::createWordInstance(loc,err)); | |
117 | break; | |
118 | case sentence: | |
119 | bi.reset(icu::BreakIterator::createSentenceInstance(loc,err)); | |
120 | break; | |
121 | case line: | |
122 | bi.reset(icu::BreakIterator::createLineInstance(loc,err)); | |
123 | break; | |
124 | default: | |
125 | throw std::runtime_error("Invalid iteration type"); | |
126 | } | |
127 | check_and_throw_icu_error(err); | |
128 | if(!bi.get()) | |
129 | throw std::runtime_error("Failed to create break iterator"); | |
11fdf7f2 | 130 | return bi.release(); |
7c673cae FG |
131 | } |
132 | ||
133 | ||
134 | template<typename CharType> | |
135 | index_type do_map(boundary_type t,CharType const *begin,CharType const *end,icu::Locale const &loc,std::string const &encoding) | |
136 | { | |
137 | index_type indx; | |
11fdf7f2 | 138 | hold_ptr<icu::BreakIterator> bi(get_iterator(t,loc)); |
7c673cae FG |
139 | |
140 | #if U_ICU_VERSION_MAJOR_NUM*100 + U_ICU_VERSION_MINOR_NUM >= 306 | |
141 | UErrorCode err=U_ZERO_ERROR; | |
142 | if(sizeof(CharType) == 2 || (sizeof(CharType)==1 && encoding=="UTF-8")) | |
143 | { | |
144 | UText *ut=0; | |
145 | try { | |
146 | if(sizeof(CharType)==1) | |
147 | ut=utext_openUTF8(0,reinterpret_cast<char const *>(begin),end-begin,&err); | |
148 | else // sizeof(CharType)==2 | |
149 | ut=utext_openUChars(0,reinterpret_cast<UChar const *>(begin),end-begin,&err); | |
150 | ||
151 | check_and_throw_icu_error(err); | |
152 | err=U_ZERO_ERROR; | |
153 | if(!ut) throw std::runtime_error("Failed to create UText"); | |
154 | bi->setText(ut,err); | |
155 | check_and_throw_icu_error(err); | |
156 | index_type res=map_direct(t,bi.get(),end-begin); | |
157 | indx.swap(res); | |
158 | } | |
159 | catch(...) { | |
160 | if(ut) | |
161 | utext_close(ut); | |
162 | throw; | |
163 | } | |
164 | if(ut) utext_close(ut); | |
165 | } | |
166 | else | |
167 | #endif | |
168 | { | |
169 | icu_std_converter<CharType> cvt(encoding); | |
170 | icu::UnicodeString str=cvt.icu(begin,end); | |
171 | bi->setText(str); | |
172 | index_type indirect = map_direct(t,bi.get(),str.length()); | |
173 | indx=indirect; | |
174 | for(size_t i=1;i<indirect.size();i++) { | |
175 | size_t offset_inderect=indirect[i-1].offset; | |
176 | size_t diff = indirect[i].offset - offset_inderect; | |
177 | size_t offset_direct=indx[i-1].offset; | |
178 | indx[i].offset=offset_direct + cvt.cut(str,begin,end,diff,offset_inderect,offset_direct); | |
179 | } | |
180 | } | |
181 | return indx; | |
182 | } // do_map | |
183 | ||
184 | template<typename CharType> | |
185 | class boundary_indexing_impl : public boundary_indexing<CharType> { | |
186 | public: | |
187 | boundary_indexing_impl(cdata const &data) : | |
188 | locale_(data.locale), | |
189 | encoding_(data.encoding) | |
190 | { | |
191 | } | |
192 | index_type map(boundary_type t,CharType const *begin,CharType const *end) const | |
193 | { | |
194 | return do_map<CharType>(t,begin,end,locale_,encoding_); | |
195 | } | |
196 | private: | |
197 | icu::Locale locale_; | |
198 | std::string encoding_; | |
199 | }; | |
200 | ||
201 | ||
202 | ||
203 | } // impl_icu | |
204 | } // boundary | |
205 | ||
206 | namespace impl_icu { | |
207 | std::locale create_boundary(std::locale const &in,cdata const &cd,character_facet_type type) | |
208 | { | |
209 | using namespace boost::locale::boundary::impl_icu; | |
210 | switch(type) { | |
211 | case char_facet: | |
212 | return std::locale(in,new boundary_indexing_impl<char>(cd)); | |
213 | case wchar_t_facet: | |
214 | return std::locale(in,new boundary_indexing_impl<wchar_t>(cd)); | |
215 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T | |
216 | case char16_t_facet: | |
217 | return std::locale(in,new boundary_indexing_impl<char16_t>(cd)); | |
218 | #endif | |
219 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T | |
220 | case char32_t_facet: | |
221 | return std::locale(in,new boundary_indexing_impl<char32_t>(cd)); | |
222 | #endif | |
223 | default: | |
224 | return in; | |
225 | } | |
226 | } | |
227 | } // impl_icu | |
228 | ||
229 | } // locale | |
230 | } // boost | |
231 | // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 |