]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 filetype=cpp.doxygen |
2 | ||
3 | // | |
4 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) | |
5 | // | |
6 | // Distributed under the Boost Software License, Version 1.0. (See | |
7 | // accompanying file LICENSE_1_0.txt or copy at | |
8 | // http://www.boost.org/LICENSE_1_0.txt) | |
9 | // | |
10 | ||
11 | /*! | |
12 | \page boundary_analysys Boundary analysis | |
13 | ||
14 | - \ref boundary_analysys_basics | |
15 | - \ref boundary_analysys_segments | |
16 | - \ref boundary_analysys_segments_basics | |
17 | - \ref boundary_analysys_segments_rules | |
18 | - \ref boundary_analysys_segments_search | |
19 | - \ref boundary_analysys_break | |
20 | - \ref boundary_analysys_break_basics | |
21 | - \ref boundary_analysys_break_rules | |
22 | - \ref boundary_analysys_break_search | |
23 | ||
24 | ||
25 | \section boundary_analysys_basics Basics | |
26 | ||
27 | Boost.Locale provides a boundary analysis tool, allowing you to split text into characters, | |
28 | words, or sentences, and find appropriate places for line breaks. | |
29 | ||
30 | \note This task is not a trivial task. | |
31 | \par | |
32 | A Unicode code point and a character are not equivalent, for example: | |
33 | Hebrew word Shalom - "שָלוֹם" that consists of 4 characters and 6 code points (4 base letters and 2 diacritical marks) | |
34 | \par | |
35 | Words may not be separated by space characters in some languages like in Japanese or Chinese. | |
36 | ||
37 | Boost.Locale provides 2 major classes for boundary analysis: | |
38 | ||
39 | - \ref boost::locale::boundary::segment_index - an object that holds an index of segments in the text (like words, characters, | |
40 | sentences). It provides access to \ref boost::locale::boundary::segment "segment" objects via iterators. | |
41 | - \ref boost::locale::boundary::boundary_point_index - an object that holds an index of boundary points in the text. | |
42 | It allows iterating over the \ref boost::locale::boundary::boundary_point "boundary_point" objects. | |
43 | ||
44 | Each of the classes above uses an iterator type as a template parameter. | |
45 | Both of these classes accept in their constructor: | |
46 | ||
47 | - A flag that defines boundary analysis \ref boost::locale::boundary::boundary_type "boundary_type". | |
48 | - The pair of iterators that define the text range that should be analysed | |
49 | - A locale parameter (if not given the global one is used) | |
50 | ||
51 | For example: | |
52 | \code | |
53 | namespace ba=boost::locale::boundary; | |
54 | std::string text= ... ; | |
55 | std::locale loc = ... ; | |
56 | ba::segment_index<std::string::const_iterator> map(ba::word,text.begin(),text.end(),loc); | |
57 | \endcode | |
58 | ||
59 | Each of them provides the members \c begin(), \c end() and \c find() that allow iterating | |
60 | over the selected segments or boundaries in the text, or finding the location of a segment or | |
61 | boundary for a given iterator. | |
62 | ||
63 | ||
64 | Convenience typedefs like \ref boost::locale::boundary::ssegment_index "ssegment_index" | |
65 | or \ref boost::locale::boundary::wcboundary_point_index "wcboundary_point_index" are provided as well, | |
66 | where the "w", "u16" and "u32" prefixes define the character type \c wchar_t, | |
67 | \c char16_t and \c char32_t, and the "s" and "c" prefixes define whether <tt>std::basic_string<CharType>::const_iterator</tt> | |
68 | or <tt>CharType const *</tt> is used as the iterator type. | |
69 | ||
70 | \section boundary_analysys_segments Iterating Over Segments | |
71 | \section boundary_analysys_segments_basics Basic Iteration | |
72 | ||
73 | The text segments analysis is done using \ref boost::locale::boundary::segment_index "segment_index" class. | |
74 | ||
75 | It provides a bidirectional iterator that returns \ref boost::locale::boundary::segment "segment" object. | |
76 | The segment object represents a pair of iterators that define this segment and a rule according to which it was selected. | |
77 | It can be automatically converted to \c std::basic_string object. | |
78 | ||
79 | To perform boundary analysis, we first create an index object and then iterate over it: | |
80 | ||
81 | For example: | |
82 | ||
83 | \code | |
84 | using namespace boost::locale::boundary; | |
85 | boost::locale::generator gen; | |
86 | std::string text="To be or not to be, that is the question."; | |
87 | // Create mapping of text for token iterator using the en_US.UTF-8 locale. | |
88 | ssegment_index map(word,text.begin(),text.end(),gen("en_US.UTF-8")); | |
89 | // Print all "words" -- chunks of word boundary | |
90 | for(ssegment_index::iterator it=map.begin(),e=map.end();it!=e;++it) | |
91 | std::cout <<"\""<< * it << "\", "; | |
92 | std::cout << std::endl; | |
93 | \endcode | |
94 | ||
95 | Would print: | |
96 | ||
97 | \verbatim | |
98 | "To", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", ",", " ", "that", " ", "is", " ", "the", " ", "question", ".", | |
99 | \endverbatim | |
100 | ||
101 | This sentence "生きるか死ぬか、それが問題だ。" (<a href="http://tatoeba.org/eng/sentences/show/868189">from Tatoeba database</a>) | |
102 | would be split into following segments in \c ja_JP.UTF-8 (Japanese) locale: | |
103 | ||
104 | \verbatim | |
105 | "生", "きるか", "死", "ぬか", "、", "それが", "問題", "だ", "。", | |
106 | \endverbatim | |
107 | ||
108 | The boundary analysis that is done by Boost.Locale | |
109 | is much more complicated than just splitting the text according | |
110 | to white space characters, even though it is not perfect. | |
111 | ||
112 | ||
113 | \section boundary_analysys_segments_rules Using Rules | |
114 | ||
115 | The segments selection can be customized using \ref boost::locale::boundary::segment_index::rule(rule_type) "rule()" and | |
116 | \ref boost::locale::boundary::segment_index::full_select(bool) "full_select()" member functions. | |
117 | ||
118 | By default segment_index's iterator returns each text segment defined by two boundary points regardless of | |
119 | the way they were selected. Thus in the example above we could see text segments like "." or " " | |
120 | that were selected as words. | |
121 | ||
122 | Using a \c rule() member function we can specify a binary mask of rules we want to use for selection of | |
123 | the boundary points using \ref bl_boundary_word_rules "word", \ref bl_boundary_line_rules "line" | |
124 | and \ref bl_boundary_sentence_rules "sentence" boundary rules. | |
125 | ||
126 | For example, by calling | |
127 | ||
128 | \code | |
129 | map.rule(word_any); | |
130 | \endcode | |
131 | ||
132 | before starting the iteration process, we specify a selection mask that fetches: numbers, letters, Kana letters and | |
133 | ideographic characters, ignoring all non-word related characters like white space or punctuation marks. | |
134 | ||
135 | So the code: | |
136 | ||
137 | \code | |
138 | using namespace boost::locale::boundary; | |
139 | std::string text="To be or not to be, that is the question."; | |
140 | // Create mapping of text for token iterator using global locale. | |
141 | ssegment_index map(word,text.begin(),text.end()); | |
142 | // Define a rule | |
143 | map.rule(word_any); | |
144 | // Print all "words" -- chunks of word boundary | |
145 | for(ssegment_index::iterator it=map.begin(),e=map.end();it!=e;++it) | |
146 | std::cout <<"\""<< * it << "\", "; | |
147 | std::cout << std::endl; | |
148 | \endcode | |
149 | ||
150 | Would print: | |
151 | ||
152 | \verbatim | |
153 | "To", "be", "or", "not", "to", "be", "that", "is", "the", "question", | |
154 | \endverbatim | |
155 | ||
156 | And for the given text="生きるか死ぬか、それが問題だ。" and rule(\ref boost::locale::boundary::word_ideo "word_ideo"), the example above would print: | |
157 | ||
158 | \verbatim | |
159 | "生", "死", "問題", | |
160 | \endverbatim | |
161 | ||
162 | You can find out which specific rules a segment was selected by using the \ref boost::locale::boundary::segment::rule() "segment::rule()" member | |
163 | function, which returns a bit-mask of rules. | |
164 | ||
165 | For example: | |
166 | ||
167 | \code | |
168 | boost::locale::generator gen; | |
169 | using namespace boost::locale::boundary; | |
170 | std::string text="生きるか死ぬか、それが問題だ。"; | |
171 | ssegment_index map(word,text.begin(),text.end(),gen("ja_JP.UTF-8")); | |
172 | for(ssegment_index::iterator it=map.begin(),e=map.end();it!=e;++it) { | |
173 | std::cout << "Segment " << *it << " contains: "; | |
174 | if(it->rule() & word_none) | |
175 | std::cout << "white space or punctuation marks "; | |
176 | if(it->rule() & word_kana) | |
177 | std::cout << "kana characters "; | |
178 | if(it->rule() & word_ideo) | |
179 | std::cout << "ideographic characters"; | |
180 | std::cout<< std::endl; | |
181 | } | |
182 | \endcode | |
183 | ||
184 | Would print | |
185 | ||
186 | \verbatim | |
187 | Segment 生 contains: ideographic characters | |
188 | Segment きるか contains: kana characters | |
189 | Segment 死 contains: ideographic characters | |
190 | Segment ぬか contains: kana characters | |
191 | Segment 、 contains: white space or punctuation marks | |
192 | Segment それが contains: kana characters | |
193 | Segment 問題 contains: ideographic characters | |
194 | Segment だ contains: kana characters | |
195 | Segment 。 contains: white space or punctuation marks | |
196 | \endverbatim | |
197 | ||
198 | One important thing that should be noted is that each segment is defined | |
199 | by a pair of boundaries and the rule of its ending point defines | |
200 | if it is selected or not. | |
201 | ||
202 | In some cases this may not be what we are actually looking for. | |
203 | ||
204 | For example we have a text: | |
205 | ||
206 | \verbatim | |
207 | Hello! How | |
208 | are you? | |
209 | \endverbatim | |
210 | ||
211 | And we want to fetch all sentences from the text. | |
212 | ||
213 | The \ref bl_boundary_sentence_rules "sentence rules" have two options: | |
214 | ||
215 | - Split the text on the point where a sentence terminator like ".!?" is detected: \ref boost::locale::boundary::sentence_term "sentence_term" | |
216 | - Split the text on the point where a sentence separator like "line feed" is detected: \ref boost::locale::boundary::sentence_sep "sentence_sep" | |
217 | ||
218 | Naturally to ignore sentence separators we would call \ref boost::locale::boundary::segment_index::rule(rule_type v) "segment_index::rule(rule_type v)" | |
219 | with sentence_term parameter and then run the iterator. | |
220 | ||
221 | \code | |
222 | boost::locale::generator gen; | |
223 | using namespace boost::locale::boundary; | |
224 | std::string text= "Hello! How\n" | |
225 | "are you?\n"; | |
226 | ssegment_index map(sentence,text.begin(),text.end(),gen("en_US.UTF-8")); | |
227 | map.rule(sentence_term); | |
228 | for(ssegment_index::iterator it=map.begin(),e=map.end();it!=e;++it) | |
229 | std::cout << "Sentence [" << *it << "]" << std::endl; | |
230 | \endcode | |
231 | ||
232 | However, we would not get the expected segments: | |
233 | \verbatim | |
234 | Sentence [Hello! ] | |
235 | Sentence [are you? | |
236 | ] | |
237 | \endverbatim | |
238 | ||
239 | The reason is that "How\n" is still considered a sentence, but one selected by a different | |
240 | rule. | |
241 | ||
242 | This behavior can be changed by setting \ref boost::locale::boundary::segment_index::full_select(bool) "segment_index::full_select(bool)" | |
243 | to \c true. It would force iterator to join the current segment with all previous segments that may not fit the required rule. | |
244 | ||
245 | So we add this line: | |
246 | ||
247 | \code | |
248 | map.full_select(true); | |
249 | \endcode | |
250 | ||
251 | Right after "map.rule(sentence_term);" and get expected output: | |
252 | ||
253 | \verbatim | |
254 | Sentence [Hello! ] | |
255 | Sentence [How | |
256 | are you? | |
257 | ] | |
258 | \endverbatim | |
259 | ||
260 | \subsection boundary_analysys_segments_search Locating Segments | |
261 | ||
262 | Sometimes it is useful to find the segment that some specific iterator is pointing to. | |
263 | ||
264 | For example, if a user has clicked at a specific point, we may want to select the word at this | |
265 | location. | |
266 | ||
267 | \ref boost::locale::boundary::segment_index "segment_index" provides | |
268 | \ref boost::locale::boundary::segment_index::find() "find(base_iterator p)" | |
269 | member function for this purpose. | |
270 | ||
271 | This function returns an iterator to the segment that \a p points to. | |
272 | ||
273 | ||
274 | For example: | |
275 | ||
276 | \code | |
277 | text="to be or "; | |
278 | ssegment_index map(word,text.begin(),text.end(),gen("en_US.UTF-8")); | |
279 | ssegment_index::iterator p = map.find(text.begin() + 4); | |
280 | if(p!=map.end()) | |
281 | std::cout << *p << std::endl; | |
282 | \endcode | |
283 | ||
284 | Would print: | |
285 | ||
286 | \verbatim | |
287 | be | |
288 | \endverbatim | |
289 | ||
290 | \note | |
291 | ||
292 | If the iterator lies inside a segment, that segment is returned. If the segment does | |
293 | not fit the selection rules, then the segment following requested position | |
294 | is returned. | |
295 | ||
296 | For example: For \ref boost::locale::boundary::word "word" boundary analysis with \ref boost::locale::boundary::word_any "word_any" rule: | |
297 | ||
298 | - "t|o be or ", would point to "to" - the iterator in the middle of segment "to". | |
299 | - "to |be or ", would point to "be" - the iterator at the beginning of the segment "be" | |
300 | - "to| be or ", would point to "be" - the iterator does not point to a segment with the required rule, so the next valid segment "be" is selected. | |
301 | - "to be or| ", would point to end as no valid segment is found. | |
302 | ||
303 | ||
304 | \section boundary_analysys_break Iterating Over Boundary Points | |
305 | \section boundary_analysys_break_basics Basic Iteration | |
306 | ||
307 | The \ref boost::locale::boundary::boundary_point_index "boundary_point_index" is similar to | |
308 | \ref boost::locale::boundary::segment_index "segment_index" in its interface but has a different role. | |
309 | Instead of returning text chunks (\ref boost::locale::boundary::segment "segment"s), it returns | |
310 | a \ref boost::locale::boundary::boundary_point "boundary_point" object that | |
311 | represents a position in the text - a base iterator that is used for | |
312 | iteration over the source text's C++ characters. | |
313 | The \ref boost::locale::boundary::boundary_point "boundary_point" object | |
314 | also provides a \ref boost::locale::boundary::boundary_point::rule() "rule()" member | |
315 | function that defines a rule this boundary was selected according to. | |
316 | ||
317 | \note The beginning and the ending of the text are considered boundary points, so even | |
318 | an empty text consists of at least one boundary point. | |
319 | ||
320 | Let's see an example of selecting the first two sentences from a text: | |
321 | ||
322 | \code | |
323 | using namespace boost::locale::boundary; | |
324 | boost::locale::generator gen; | |
325 | ||
326 | // our text sample | |
327 | std::string const text="First sentence. Second sentence! Third one?"; | |
328 | // Create an index | |
329 | sboundary_point_index map(sentence,text.begin(),text.end(),gen("en_US.UTF-8")); | |
330 | ||
331 | // Count two boundary points | |
332 | sboundary_point_index::iterator p = map.begin(),e=map.end(); | |
333 | int count = 0; | |
334 | while(p!=e && count < 2) { | |
335 | ++count; | |
336 | ++p; | |
337 | } | |
338 | ||
339 | if(p!=e) { | |
340 | std::cout << "First two sentences are: " | |
341 | << std::string(text.begin(),p->iterator()) | |
342 | << std::endl; | |
343 | } | |
344 | else { | |
345 | std::cout <<"There are less then two sentences in this " | |
346 | <<"text: " << text << std::endl; | |
347 | }\endcode | |
348 | ||
349 | Would print: | |
350 | ||
351 | \verbatim | |
352 | First two sentences are: First sentence. Second sentence! | |
353 | \endverbatim | |
354 | ||
355 | \section boundary_analysys_break_rules Using Rules | |
356 | ||
357 | Similarly to the \ref boost::locale::boundary::segment_index "segment_index" the | |
358 | \ref boost::locale::boundary::boundary_point_index "boundary_point_index" provides | |
359 | a \ref boost::locale::boundary::boundary_point_index::rule(rule_type r) "rule(rule_type mask)" | |
360 | member function to filter boundary points that interest us. | |
361 | ||
362 | It allows setting \ref bl_boundary_word_rules "word", \ref bl_boundary_line_rules "line" | |
363 | and \ref bl_boundary_sentence_rules "sentence" rules for filtering boundary points. | |
364 | ||
365 | Let's change the example above a little: | |
366 | ||
367 | \code | |
368 | // our text sample | |
369 | std::string const text= "First sentence. Second\n" | |
370 | "sentence! Third one?"; | |
371 | \endcode | |
372 | ||
373 | If we run our program as is on the sample above we would get: | |
374 | \verbatim | |
375 | First two sentences are: First sentence. Second | |
376 | \endverbatim | |
377 | ||
378 | Which is not something that we really expected. As the "Second\n" | |
379 | is considered an independent sentence that was separated by | |
380 | a line separator "Line Feed". | |
381 | ||
382 | However, we can set a rule \ref boost::locale::boundary::sentence_term "sentence_term" | |
383 | and the iterator would use only boundary points that are created | |
384 | by sentence terminators like ".!?". | |
385 | ||
386 | So by adding: | |
387 | \code | |
388 | map.rule(sentence_term); | |
389 | \endcode | |
390 | ||
391 | Right after the generation of the index we would get the desired output: | |
392 | ||
393 | \verbatim | |
394 | First two sentences are: First sentence. Second | |
395 | sentence! | |
396 | \endverbatim | |
397 | ||
398 | You can also use \ref boost::locale::boundary::boundary_point::rule() "boundary_point::rule()" member | |
399 | function to learn about the reason this boundary point was created by comparing it with an appropriate | |
400 | mask. | |
401 | ||
402 | For example: | |
403 | ||
404 | \code | |
405 | using namespace boost::locale::boundary; | |
406 | boost::locale::generator gen; | |
407 | // our text sample | |
408 | std::string const text= "First sentence. Second\n" | |
409 | "sentence! Third one?"; | |
410 | sboundary_point_index map(sentence,text.begin(),text.end(),gen("en_US.UTF-8")); | |
411 | ||
412 | for(sboundary_point_index::iterator p = map.begin(),e=map.end();p!=e;++p) { | |
413 | if(p->rule() & sentence_term) | |
414 | std::cout << "There is a sentence terminator: "; | |
415 | else if(p->rule() & sentence_sep) | |
416 | std::cout << "There is a sentence separator: "; | |
417 | if(p->rule()!=0) // print if some rule exists | |
418 | std::cout << "[" << std::string(text.begin(),p->iterator()) | |
419 | << "|" << std::string(p->iterator(),text.end()) | |
420 | << "]\n"; | |
421 | } | |
422 | \endcode | |
423 | ||
424 | Would give the following output: | |
425 | \verbatim | |
426 | There is a sentence terminator: [First sentence. |Second | |
427 | sentence! Third one?] | |
428 | There is a sentence separator: [First sentence. Second | |
429 | |sentence! Third one?] | |
430 | There is a sentence terminator: [First sentence. Second | |
431 | sentence! |Third one?] | |
432 | There is a sentence terminator: [First sentence. Second | |
433 | sentence! Third one?|] | |
434 | \endverbatim | |
435 | ||
436 | \subsection boundary_analysys_break_search Locating Boundary Points | |
437 | ||
438 | Sometimes it is useful to find a specific boundary point according to given | |
439 | iterator. | |
440 | ||
441 | \ref boost::locale::boundary::boundary_point_index "boundary_point_index" provides | |
442 | a \ref boost::locale::boundary::boundary_point_index::find() "iterator find(base_iterator p)" member | |
443 | function. | |
444 | ||
445 | It would return an iterator to a boundary point on \a p's location or at the | |
446 | location following it if \a p does not point to appropriate position. | |
447 | ||
448 | For example, for word boundary analysis: | |
449 | ||
450 | - If a base iterator points to "to |be", then the returned boundary point would be "to |be" (same position) | |
451 | - If a base iterator points to "t|o be", then the returned boundary point would be "to| be" (next valid position) | |
452 | ||
453 | For example if we want to select 6 words around specific boundary point we can use following code: | |
454 | ||
455 | \code | |
456 | using namespace boost::locale::boundary; | |
457 | boost::locale::generator gen; | |
458 | // our text sample | |
459 | std::string const text= "To be or not to be, that is the question."; | |
460 | ||
461 | // Create a mapping | |
462 | sboundary_point_index map(word,text.begin(),text.end(),gen("en_US.UTF-8")); | |
463 | // Ignore white space | |
464 | map.rule(word_any); | |
465 | ||
466 | // define our arbitrary point | |
467 | std::string::const_iterator pos = text.begin() + 12; // "no|t"; | |
468 | ||
469 | // Get the search range | |
470 | sboundary_point_index::iterator | |
471 | begin =map.begin(), | |
472 | end = map.end(), | |
473 | it = map.find(pos); // find a boundary | |
474 | ||
475 | // go 3 words backward | |
476 | for(int count = 0;count <3 && it!=begin; count ++) | |
477 | --it; | |
478 | ||
479 | // Save the start | |
480 | std::string::const_iterator start = *it; | |
481 | ||
482 | // go 6 words forward | |
483 | for(int count = 0;count < 6 && it!=end; count ++) | |
484 | ++it; | |
485 | ||
486 | // make sure we are at a valid position | |
487 | if(it==end) | |
488 | --it; | |
489 | ||
490 | // print the text | |
491 | std::cout << std::string(start,it->iterator()) << std::endl; | |
492 | \endcode | |
493 | ||
494 | That would print: | |
495 | ||
496 | \verbatim | |
497 | be or not to be, that | |
498 | \endverbatim | |
499 | ||
500 | ||
501 | */ | |
502 | ||
503 |