]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/locale/doc/boundary_analysys.txt
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / boost / libs / locale / doc / boundary_analysys.txt
1 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 filetype=cpp.doxygen
2
3 //
4 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
5 //
6 // Distributed under the Boost Software License, Version 1.0. (See
7 // accompanying file LICENSE_1_0.txt or copy at
8 // http://www.boost.org/LICENSE_1_0.txt)
9 //
10
11 /*!
12 \page boundary_analysys Boundary analysis
13
14 - \ref boundary_analysys_basics
15 - \ref boundary_analysys_segments
16 - \ref boundary_analysys_segments_basics
17 - \ref boundary_analysys_segments_rules
18 - \ref boundary_analysys_segments_search
19 - \ref boundary_analysys_break
20 - \ref boundary_analysys_break_basics
21 - \ref boundary_analysys_break_rules
22 - \ref boundary_analysys_break_search
23
24
25 \section boundary_analysys_basics Basics
26
27 Boost.Locale provides a boundary analysis tool, allowing you to split text into characters,
28 words, or sentences, and find appropriate places for line breaks.
29
30 \note Boundary analysis is not a trivial task.
31 \par
32 A Unicode code point and a character are not equivalent. For example,
33 the Hebrew word Shalom - "שָלוֹם" consists of 4 characters and 6 code points (4 base letters and 2 diacritical marks).
34 \par
35 In some languages, such as Japanese or Chinese, words may not be separated by space characters.
36
37 Boost.Locale provides 2 major classes for boundary analysis:
38
39 - \ref boost::locale::boundary::segment_index - an object that holds an index of segments in the text (like words, characters,
40 sentences). It provides access to \ref boost::locale::boundary::segment "segment" objects via iterators.
41 - \ref boost::locale::boundary::boundary_point_index - an object that holds an index of boundary points in the text.
42 It allows iterating over the \ref boost::locale::boundary::boundary_point "boundary_point" objects.
43
44 Each of the classes above uses an iterator type as a template parameter.
45 Both of these classes accept in their constructor:
46
47 - A flag that defines boundary analysis \ref boost::locale::boundary::boundary_type "boundary_type".
48 - The pair of iterators that define the text range that should be analysed
49 - A locale parameter (if not given the global one is used)
50
51 For example:
52 \code
53 namespace ba=boost::locale::boundary;
54 std::string text= ... ;
55 std::locale loc = ... ;
56 ba::segment_index<std::string::const_iterator> map(ba::word,text.begin(),text.end(),loc);
57 \endcode
58
59 Each of them provides the member functions \c begin(), \c end() and \c find() that allow iterating
60 over the selected segments or boundaries in the text or finding the location of a segment or
61 boundary for a given iterator.
62
63
64 Convenience typedefs like \ref boost::locale::boundary::ssegment_index "ssegment_index"
65 or \ref boost::locale::boundary::wcboundary_point_index "wcboundary_point_index" are provided as well,
66 where the "w", "u16" and "u32" prefixes define the character type \c wchar_t,
67 \c char16_t and \c char32_t, and the "s" and "c" prefixes define whether <tt>std::basic_string<CharType>::const_iterator</tt>
68 or <tt>CharType const *</tt> is used, respectively.
69
70 \section boundary_analysys_segments Iterating Over Segments
71 \section boundary_analysys_segments_basics Basic Iteration
72
73 Text segment analysis is done using the \ref boost::locale::boundary::segment_index "segment_index" class.
74
75 It provides a bidirectional iterator that returns a \ref boost::locale::boundary::segment "segment" object.
76 The segment object represents a pair of iterators that define this segment and a rule according to which it was selected.
77 It can be automatically converted to a \c std::basic_string object.
78
79 To perform boundary analysis, we first create an index object and then iterate over it:
80
81 For example:
82
83 \code
84 using namespace boost::locale::boundary;
85 boost::locale::generator gen;
86 std::string text="To be or not to be, that is the question.";
87 // Create mapping of text for token iterator using the en_US.UTF-8 locale.
88 ssegment_index map(word,text.begin(),text.end(),gen("en_US.UTF-8"));
89 // Print all "words" -- chunks of word boundary
90 for(ssegment_index::iterator it=map.begin(),e=map.end();it!=e;++it)
91 std::cout <<"\""<< * it << "\", ";
92 std::cout << std::endl;
93 \endcode
94
95 Would print:
96
97 \verbatim
98 "To", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", ",", " ", "that", " ", "is", " ", "the", " ", "question", ".",
99 \endverbatim
100
101 The sentence "生きるか死ぬか、それが問題だ。" (<a href="http://tatoeba.org/eng/sentences/show/868189">from Tatoeba database</a>)
102 would be split into the following segments in the \c ja_JP.UTF-8 (Japanese) locale:
103
104 \verbatim
105 "生", "きるか", "死", "ぬか", "、", "それが", "問題", "だ", "。",
106 \endverbatim
107
108 The boundary analysis that is done by Boost.Locale
109 is much more complicated than just splitting the text according
110 to white space characters, even though it is not perfect.
111
112
113 \section boundary_analysys_segments_rules Using Rules
114
115 The segments selection can be customized using \ref boost::locale::boundary::segment_index::rule(rule_type) "rule()" and
116 \ref boost::locale::boundary::segment_index::full_select(bool) "full_select()" member functions.
117
118 By default, segment_index's iterator returns each text segment defined by two boundary points regardless
119 of the way they were selected. Thus in the example above we could see text segments like "." or " "
120 that were selected as words.
121
122 Using a \c rule() member function we can specify a binary mask of rules we want to use for selection of
123 the boundary points using \ref bl_boundary_word_rules "word", \ref bl_boundary_line_rules "line"
124 and \ref bl_boundary_sentence_rules "sentence" boundary rules.
125
126 For example, by calling
127
128 \code
129 map.rule(word_any);
130 \endcode
131
132 before starting the iteration process, we specify a selection mask that fetches numbers, letters, Kana letters and
133 ideographic characters, ignoring all non-word related characters like white space or punctuation marks.
134
135 So the code:
136
137 \code
138 using namespace boost::locale::boundary;
139 std::string text="To be or not to be, that is the question.";
140 // Create mapping of text for token iterator using global locale.
141 ssegment_index map(word,text.begin(),text.end());
142 // Define a rule
143 map.rule(word_any);
144 // Print all "words" -- chunks of word boundary
145 for(ssegment_index::iterator it=map.begin(),e=map.end();it!=e;++it)
146 std::cout <<"\""<< * it << "\", ";
147 std::cout << std::endl;
148 \endcode
149
150 Would print:
151
152 \verbatim
153 "To", "be", "or", "not", "to", "be", "that", "is", "the", "question",
154 \endverbatim
155
156 And for the given text="生きるか死ぬか、それが問題だ。" and rule(\ref boost::locale::boundary::word_ideo "word_ideo"), the example above would print:
157
158 \verbatim
159 "生", "死", "問題",
160 \endverbatim
161
162 You can find out which specific rules a segment was selected by using the \ref boost::locale::boundary::segment::rule() "segment::rule()" member
163 function, which returns a bit-mask of rules.
164
165 For example:
166
167 \code
168 boost::locale::generator gen;
169 using namespace boost::locale::boundary;
170 std::string text="生きるか死ぬか、それが問題だ。";
171 ssegment_index map(word,text.begin(),text.end(),gen("ja_JP.UTF-8"));
172 for(ssegment_index::iterator it=map.begin(),e=map.end();it!=e;++it) {
173 std::cout << "Segment " << *it << " contains: ";
174 if(it->rule() & word_none)
175 std::cout << "white space or punctuation marks ";
176 if(it->rule() & word_kana)
177 std::cout << "kana characters ";
178 if(it->rule() & word_ideo)
179 std::cout << "ideographic characters";
180 std::cout<< std::endl;
181 }
182 \endcode
183
184 Would print
185
186 \verbatim
187 Segment 生 contains: ideographic characters
188 Segment きるか contains: kana characters
189 Segment 死 contains: ideographic characters
190 Segment ぬか contains: kana characters
191 Segment 、 contains: white space or punctuation marks
192 Segment それが contains: kana characters
193 Segment 問題 contains: ideographic characters
194 Segment だ contains: kana characters
195 Segment 。 contains: white space or punctuation marks
196 \endverbatim
197
198 One important thing that should be noted is that each segment is defined
199 by a pair of boundaries and the rule of its ending point defines
200 if it is selected or not.
201
202 In some cases this may not be what we actually want.
203
204 For example we have a text:
205
206 \verbatim
207 Hello! How
208 are you?
209 \endverbatim
210
211 And we want to fetch all sentences from the text.
212
213 The \ref bl_boundary_sentence_rules "sentence rules" have two options:
214
215 - Split the text on the point where a sentence terminator like ".!?" is detected: \ref boost::locale::boundary::sentence_term "sentence_term"
216 - Split the text on the point where a sentence separator like "line feed" is detected: \ref boost::locale::boundary::sentence_sep "sentence_sep"
217
218 Naturally to ignore sentence separators we would call \ref boost::locale::boundary::segment_index::rule(rule_type v) "segment_index::rule(rule_type v)"
219 with sentence_term parameter and then run the iterator.
220
221 \code
222 boost::locale::generator gen;
223 using namespace boost::locale::boundary;
224 std::string text= "Hello! How\n"
225 "are you?\n";
226 ssegment_index map(sentence,text.begin(),text.end(),gen("en_US.UTF-8"));
227 map.rule(sentence_term);
228 for(ssegment_index::iterator it=map.begin(),e=map.end();it!=e;++it)
229 std::cout << "Sentence [" << *it << "]" << std::endl;
230 \endcode
231
232 However, we would not get the expected segments:
233 \verbatim
234 Sentence [Hello! ]
235 Sentence [are you?
236 ]
237 \endverbatim
238
239 The reason is that "How\n" is still considered a sentence but is selected by a different
240 rule.
241
242 This behavior can be changed by setting \ref boost::locale::boundary::segment_index::full_select(bool) "segment_index::full_select(bool)"
243 to \c true. It would force the iterator to join the current segment with all previous segments that may not fit the required rule.
244
245 So we add this line:
246
247 \code
248 map.full_select(true);
249 \endcode
250
251 Right after "map.rule(sentence_term);" and get the expected output:
252
253 \verbatim
254 Sentence [Hello! ]
255 Sentence [How
256 are you?
257 ]
258 \endverbatim
259
260 \subsection boundary_analysys_segments_search Locating Segments
261
262 Sometimes it is useful to find the segment that some specific iterator is pointing to.
263
264 For example, when a user has clicked at a specific point, we may want to select the word at this
265 location.
266
267 \ref boost::locale::boundary::segment_index "segment_index" provides
268 \ref boost::locale::boundary::segment_index::find() "find(base_iterator p)"
269 member function for this purpose.
270
271 This function returns an iterator to the segment that \a p points to.
272
273
274 For example:
275
276 \code
277 text="to be or ";
278 ssegment_index map(word,text.begin(),text.end(),gen("en_US.UTF-8"));
279 ssegment_index::iterator p = map.find(text.begin() + 4);
280 if(p!=map.end())
281 std::cout << *p << std::endl;
282 \endcode
283
284 Would print:
285
286 \verbatim
287 be
288 \endverbatim
289
290 \note
291
292 If the iterator lies inside a segment, this segment is returned. If the segment does
293 not fit the selection rules, then the segment following the requested position
294 is returned.
295
296 For example: For \ref boost::locale::boundary::word "word" boundary analysis with \ref boost::locale::boundary::word_any "word_any" rule:
297
298 - "t|o be or ", would point to "to" - the iterator is in the middle of the segment "to".
299 - "to |be or ", would point to "be" - the iterator is at the beginning of the segment "be".
300 - "to| be or ", would point to "be" - the iterator does not point to a segment with the required rule, so the next valid segment "be" is selected.
301 - "to be or| ", would point to end as no valid segment is found.
302
303
304 \section boundary_analysys_break Iterating Over Boundary Points
305 \section boundary_analysys_break_basics Basic Iteration
306
307 The \ref boost::locale::boundary::boundary_point_index "boundary_point_index" is similar to
308 \ref boost::locale::boundary::segment_index "segment_index" in its interface but has a different role.
309 Instead of returning text chunks (\ref boost::locale::boundary::segment "segment"s), it returns a
310 \ref boost::locale::boundary::boundary_point "boundary_point" object that
311 represents a position in the text - a base iterator that is used for
312 iteration over the C++ characters of the source text.
313 The \ref boost::locale::boundary::boundary_point "boundary_point" object
314 also provides a \ref boost::locale::boundary::boundary_point::rule() "rule()" member
315 function that defines a rule this boundary was selected according to.
316
317 \note The beginning and the ending of the text are considered boundary points, so even
318 an empty text consists of at least one boundary point.
319
320 Let's see an example of selecting the first two sentences from a text:
321
322 \code
323 using namespace boost::locale::boundary;
324 boost::locale::generator gen;
325
326 // our text sample
327 std::string const text="First sentence. Second sentence! Third one?";
328 // Create an index
329 sboundary_point_index map(sentence,text.begin(),text.end(),gen("en_US.UTF-8"));
330
331 // Count two boundary points
332 sboundary_point_index::iterator p = map.begin(),e=map.end();
333 int count = 0;
334 while(p!=e && count < 2) {
335 ++count;
336 ++p;
337 }
338
339 if(p!=e) {
340 std::cout << "First two sentences are: "
341 << std::string(text.begin(),p->iterator())
342 << std::endl;
343 }
344 else {
345 std::cout <<"There are less then two sentences in this "
346 <<"text: " << text << std::endl;
347 }\endcode
348
349 Would print:
350
351 \verbatim
352 First two sentences are: First sentence. Second sentence!
353 \endverbatim
354
355 \section boundary_analysys_break_rules Using Rules
356
357 Similarly to the \ref boost::locale::boundary::segment_index "segment_index" the
358 \ref boost::locale::boundary::boundary_point_index "boundary_point_index" provides
359 a \ref boost::locale::boundary::boundary_point_index::rule(rule_type r) "rule(rule_type mask)"
360 member function to filter boundary points that interest us.
361
362 It allows setting \ref bl_boundary_word_rules "word", \ref bl_boundary_line_rules "line"
363 and \ref bl_boundary_sentence_rules "sentence" rules for filtering boundary points.
364
365 Let's change the example above a little:
366
367 \code
368 // our text sample
369 std::string const text= "First sentence. Second\n"
370 "sentence! Third one?";
371 \endcode
372
373 If we run our program as is on the sample above we would get:
374 \verbatim
375 First two sentences are: First sentence. Second
376 \endverbatim
377
378 This is not something that we really expected, as "Second\n"
379 is considered an independent sentence that was separated by
380 a line separator "Line Feed".
381
382 However, we can set a rule \ref boost::locale::boundary::sentence_term "sentence_term"
383 and the iterator would use only boundary points that are created
384 by sentence terminators like ".!?".
385
386 So by adding:
387 \code
388 map.rule(sentence_term);
389 \endcode
390
391 Right after the generation of the index we would get the desired output:
392
393 \verbatim
394 First two sentences are: First sentence. Second
395 sentence!
396 \endverbatim
397
398 You can also use \ref boost::locale::boundary::boundary_point::rule() "boundary_point::rule()" member
399 function to learn about the reason this boundary point was created by comparing it with an appropriate
400 mask.
401
402 For example:
403
404 \code
405 using namespace boost::locale::boundary;
406 boost::locale::generator gen;
407 // our text sample
408 std::string const text= "First sentence. Second\n"
409 "sentence! Third one?";
410 sboundary_point_index map(sentence,text.begin(),text.end(),gen("en_US.UTF-8"));
411
412 for(sboundary_point_index::iterator p = map.begin(),e=map.end();p!=e;++p) {
413 if(p->rule() & sentence_term)
414 std::cout << "There is a sentence terminator: ";
415 else if(p->rule() & sentence_sep)
416 std::cout << "There is a sentence separator: ";
417 if(p->rule()!=0) // print if some rule exists
418 std::cout << "[" << std::string(text.begin(),p->iterator())
419 << "|" << std::string(p->iterator(),text.end())
420 << "]\n";
421 }
422 \endcode
423
424 Would give the following output:
425 \verbatim
426 There is a sentence terminator: [First sentence. |Second
427 sentence! Third one?]
428 There is a sentence separator: [First sentence. Second
429 |sentence! Third one?]
430 There is a sentence terminator: [First sentence. Second
431 sentence! |Third one?]
432 There is a sentence terminator: [First sentence. Second
433 sentence! Third one?|]
434 \endverbatim
435
436 \subsection boundary_analysys_break_search Locating Boundary Points
437
438 Sometimes it is useful to find a specific boundary point according to given
439 iterator.
440
441 \ref boost::locale::boundary::boundary_point_index "boundary_point_index" provides
442 a \ref boost::locale::boundary::boundary_point_index::find() "iterator find(base_iterator p)" member
443 function.
444
445 It would return an iterator to a boundary point at \a p's location or at the
446 location following it if \a p does not point to an appropriate position.
447
448 For example, for word boundary analysis:
449
450 - If a base iterator points to "to |be", then the returned boundary point would be "to |be" (same position)
451 - If a base iterator points to "t|o be", then the returned boundary point would be "to| be" (next valid position)
452
453 For example, if we want to select 6 words around a specific boundary point, we can use the following code:
454
455 \code
456 using namespace boost::locale::boundary;
457 boost::locale::generator gen;
458 // our text sample
459 std::string const text= "To be or not to be, that is the question.";
460
461 // Create a mapping
462 sboundary_point_index map(word,text.begin(),text.end(),gen("en_US.UTF-8"));
463 // Ignore white space
464 map.rule(word_any);
465
466 // define our arbitrary point
467 std::string::const_iterator pos = text.begin() + 12; // "not| to";
468
469 // Get the search range
470 sboundary_point_index::iterator
471 begin =map.begin(),
472 end = map.end(),
473 it = map.find(pos); // find a boundary
474
475 // go 3 words backward
476 for(int count = 0;count <3 && it!=begin; count ++)
477 --it;
478
479 // Save the start
480 std::string::const_iterator start = *it;
481
482 // go 6 words forward
483 for(int count = 0;count < 6 && it!=end; count ++)
484 ++it;
485
486 // make sure we are at a valid position
487 if(it==end)
488 --it;
489
490 // print the text
491 std::cout << std::string(start,it->iterator()) << std::endl;
492 \endcode
493
494 That would print:
495
496 \verbatim
497 be or not to be, that
498 \endverbatim
499
500
501 */
502
503