]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0. (See | |
5 | // accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt) | |
7 | // | |
8 | #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED | |
9 | #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED | |
10 | ||
11 | #include <boost/locale/config.hpp> | |
12 | #include <boost/locale/boundary/types.hpp> | |
13 | #include <boost/locale/boundary/facets.hpp> | |
14 | #include <boost/locale/boundary/segment.hpp> | |
15 | #include <boost/locale/boundary/boundary_point.hpp> | |
16 | #include <boost/iterator/iterator_facade.hpp> | |
17 | #include <boost/type_traits/is_same.hpp> | |
18 | #include <boost/shared_ptr.hpp> | |
19 | #include <boost/cstdint.hpp> | |
20 | #include <boost/assert.hpp> | |
21 | #ifdef BOOST_MSVC | |
22 | # pragma warning(push) | |
23 | # pragma warning(disable : 4275 4251 4231 4660) | |
24 | #endif | |
25 | #include <string> | |
26 | #include <locale> | |
27 | #include <vector> | |
28 | #include <iterator> | |
29 | #include <algorithm> | |
30 | #include <stdexcept> | |
31 | ||
32 | #include <iostream> | |
33 | ||
34 | namespace boost { | |
35 | ||
36 | namespace locale { | |
37 | ||
38 | namespace boundary { | |
39 | /// | |
40 | /// \defgroup boundary Boundary Analysis | |
41 | /// | |
42 | /// This module contains all operations required for %boundary analysis of text: character, word, line and sentence boundaries | |
43 | /// | |
44 | /// @{ | |
45 | /// | |
46 | ||
47 | /// \cond INTERNAL | |
48 | ||
49 | namespace details { | |
50 | ||
51 | template<typename IteratorType,typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category> | |
52 | struct mapping_traits { | |
53 | typedef typename std::iterator_traits<IteratorType>::value_type char_type; | |
54 | static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) | |
55 | { | |
56 | std::basic_string<char_type> str(b,e); | |
57 | return std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size()); | |
58 | } | |
59 | }; | |
60 | ||
//
// Compile-time check whether SomeIteratorType is one of the iterator types that this
// header treats as "linear": a raw pointer to CharType, or an iterator of
// std::basic_string<CharType> / std::vector<CharType> (const or non-const).
//
template<typename CharType, typename SomeIteratorType>
struct linear_iterator_traits {
private:
    typedef std::basic_string<CharType> string_type;
    typedef std::vector<CharType> vector_type;

    static const bool is_raw_pointer =
        is_same<SomeIteratorType, CharType *>::value
        || is_same<SomeIteratorType, CharType const *>::value;

    static const bool is_string_iterator =
        is_same<SomeIteratorType, typename string_type::iterator>::value
        || is_same<SomeIteratorType, typename string_type::const_iterator>::value;

    static const bool is_vector_iterator =
        is_same<SomeIteratorType, typename vector_type::iterator>::value
        || is_same<SomeIteratorType, typename vector_type::const_iterator>::value;

public:
    // true when SomeIteratorType matches any of the iterator types listed above
    static const bool is_linear = is_raw_pointer || is_string_iterator || is_vector_iterator;
};
72 | ||
73 | ||
74 | ||
75 | template<typename IteratorType> | |
76 | struct mapping_traits<IteratorType,std::random_access_iterator_tag> { | |
77 | ||
78 | typedef typename std::iterator_traits<IteratorType>::value_type char_type; | |
79 | ||
80 | ||
81 | ||
82 | static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) | |
83 | { | |
84 | index_type result; | |
85 | ||
86 | // | |
87 | // Optimize for most common cases | |
88 | // | |
89 | // C++0x requires that string is continious in memory and all known | |
90 | // string implementations | |
91 | // do this because of c_str() support. | |
92 | // | |
93 | ||
94 | if(linear_iterator_traits<char_type,IteratorType>::is_linear && b!=e) | |
95 | { | |
96 | char_type const *begin = &*b; | |
97 | char_type const *end = begin + (e-b); | |
98 | index_type tmp=std::use_facet<boundary_indexing<char_type> >(l).map(t,begin,end); | |
99 | result.swap(tmp); | |
100 | } | |
101 | else { | |
102 | std::basic_string<char_type> str(b,e); | |
103 | index_type tmp = std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size()); | |
104 | result.swap(tmp); | |
105 | } | |
106 | return result; | |
107 | } | |
108 | }; | |
109 | ||
110 | template<typename BaseIterator> | |
111 | class mapping { | |
112 | public: | |
113 | typedef BaseIterator base_iterator; | |
114 | typedef typename std::iterator_traits<base_iterator>::value_type char_type; | |
115 | ||
116 | ||
117 | mapping(boundary_type type, | |
118 | base_iterator begin, | |
119 | base_iterator end, | |
120 | std::locale const &loc) | |
121 | : | |
122 | index_(new index_type()), | |
123 | begin_(begin), | |
124 | end_(end) | |
125 | { | |
126 | index_type idx=details::mapping_traits<base_iterator>::map(type,begin,end,loc); | |
127 | index_->swap(idx); | |
128 | } | |
129 | ||
130 | mapping() | |
131 | { | |
132 | } | |
133 | ||
134 | index_type const &index() const | |
135 | { | |
136 | return *index_; | |
137 | } | |
138 | ||
139 | base_iterator begin() const | |
140 | { | |
141 | return begin_; | |
142 | } | |
143 | ||
144 | base_iterator end() const | |
145 | { | |
146 | return end_; | |
147 | } | |
148 | ||
149 | private: | |
150 | boost::shared_ptr<index_type> index_; | |
151 | base_iterator begin_,end_; | |
152 | }; | |
153 | ||
154 | template<typename BaseIterator> | |
155 | class segment_index_iterator : | |
156 | public boost::iterator_facade< | |
157 | segment_index_iterator<BaseIterator>, | |
158 | segment<BaseIterator>, | |
159 | boost::bidirectional_traversal_tag, | |
160 | segment<BaseIterator> const & | |
161 | > | |
162 | { | |
163 | public: | |
164 | typedef BaseIterator base_iterator; | |
165 | typedef mapping<base_iterator> mapping_type; | |
166 | typedef segment<base_iterator> segment_type; | |
167 | ||
168 | segment_index_iterator() : current_(0,0),map_(0) | |
169 | { | |
170 | } | |
171 | ||
172 | segment_index_iterator(base_iterator p,mapping_type const *map,rule_type mask,bool full_select) : | |
173 | map_(map), | |
174 | mask_(mask), | |
175 | full_select_(full_select) | |
176 | { | |
177 | set(p); | |
178 | } | |
179 | segment_index_iterator(bool is_begin,mapping_type const *map,rule_type mask,bool full_select) : | |
180 | map_(map), | |
181 | mask_(mask), | |
182 | full_select_(full_select) | |
183 | { | |
184 | if(is_begin) | |
185 | set_begin(); | |
186 | else | |
187 | set_end(); | |
188 | } | |
189 | ||
190 | segment_type const &dereference() const | |
191 | { | |
192 | return value_; | |
193 | } | |
194 | ||
195 | bool equal(segment_index_iterator const &other) const | |
196 | { | |
197 | return map_ == other.map_ && current_.second == other.current_.second; | |
198 | } | |
199 | ||
200 | void increment() | |
201 | { | |
202 | std::pair<size_t,size_t> next = current_; | |
203 | if(full_select_) { | |
204 | next.first = next.second; | |
205 | while(next.second < size()) { | |
206 | next.second++; | |
207 | if(valid_offset(next.second)) | |
208 | break; | |
209 | } | |
210 | if(next.second == size()) | |
211 | next.first = next.second - 1; | |
212 | } | |
213 | else { | |
214 | while(next.second < size()) { | |
215 | next.first = next.second; | |
216 | next.second++; | |
217 | if(valid_offset(next.second)) | |
218 | break; | |
219 | } | |
220 | } | |
221 | update_current(next); | |
222 | } | |
223 | ||
224 | void decrement() | |
225 | { | |
226 | std::pair<size_t,size_t> next = current_; | |
227 | if(full_select_) { | |
228 | while(next.second >1) { | |
229 | next.second--; | |
230 | if(valid_offset(next.second)) | |
231 | break; | |
232 | } | |
233 | next.first = next.second; | |
234 | while(next.first >0) { | |
235 | next.first--; | |
236 | if(valid_offset(next.first)) | |
237 | break; | |
238 | } | |
239 | } | |
240 | else { | |
241 | while(next.second >1) { | |
242 | next.second--; | |
243 | if(valid_offset(next.second)) | |
244 | break; | |
245 | } | |
246 | next.first = next.second - 1; | |
247 | } | |
248 | update_current(next); | |
249 | } | |
250 | ||
251 | private: | |
252 | ||
253 | void set_end() | |
254 | { | |
255 | current_.first = size() - 1; | |
256 | current_.second = size(); | |
257 | value_ = segment_type(map_->end(),map_->end(),0); | |
258 | } | |
259 | void set_begin() | |
260 | { | |
261 | current_.first = current_.second = 0; | |
262 | value_ = segment_type(map_->begin(),map_->begin(),0); | |
263 | increment(); | |
264 | } | |
265 | ||
266 | void set(base_iterator p) | |
267 | { | |
268 | size_t dist=std::distance(map_->begin(),p); | |
269 | index_type::const_iterator b=map_->index().begin(),e=map_->index().end(); | |
270 | index_type::const_iterator | |
271 | boundary_point=std::upper_bound(b,e,break_info(dist)); | |
272 | while(boundary_point != e && (boundary_point->rule & mask_)==0) | |
273 | boundary_point++; | |
274 | ||
275 | current_.first = current_.second = boundary_point - b; | |
276 | ||
277 | if(full_select_) { | |
278 | while(current_.first > 0) { | |
279 | current_.first --; | |
280 | if(valid_offset(current_.first)) | |
281 | break; | |
282 | } | |
283 | } | |
284 | else { | |
285 | if(current_.first > 0) | |
286 | current_.first --; | |
287 | } | |
288 | value_.first = map_->begin(); | |
289 | std::advance(value_.first,get_offset(current_.first)); | |
290 | value_.second = value_.first; | |
291 | std::advance(value_.second,get_offset(current_.second) - get_offset(current_.first)); | |
292 | ||
293 | update_rule(); | |
294 | } | |
295 | ||
296 | void update_current(std::pair<size_t,size_t> pos) | |
297 | { | |
298 | std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first); | |
299 | std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second); | |
300 | std::advance(value_.first,first_diff); | |
301 | std::advance(value_.second,second_diff); | |
302 | current_ = pos; | |
303 | update_rule(); | |
304 | } | |
305 | ||
306 | void update_rule() | |
307 | { | |
308 | if(current_.second != size()) { | |
309 | value_.rule(index()[current_.second].rule); | |
310 | } | |
311 | } | |
312 | size_t get_offset(size_t ind) const | |
313 | { | |
314 | if(ind == size()) | |
315 | return index().back().offset; | |
316 | return index()[ind].offset; | |
317 | } | |
318 | ||
319 | bool valid_offset(size_t offset) const | |
320 | { | |
321 | return offset == 0 | |
322 | || offset == size() // make sure we not acess index[size] | |
323 | || (index()[offset].rule & mask_)!=0; | |
324 | } | |
325 | ||
326 | size_t size() const | |
327 | { | |
328 | return index().size(); | |
329 | } | |
330 | ||
331 | index_type const &index() const | |
332 | { | |
333 | return map_->index(); | |
334 | } | |
335 | ||
336 | ||
337 | segment_type value_; | |
338 | std::pair<size_t,size_t> current_; | |
339 | mapping_type const *map_; | |
340 | rule_type mask_; | |
341 | bool full_select_; | |
342 | }; | |
343 | ||
344 | template<typename BaseIterator> | |
345 | class boundary_point_index_iterator : | |
346 | public boost::iterator_facade< | |
347 | boundary_point_index_iterator<BaseIterator>, | |
348 | boundary_point<BaseIterator>, | |
349 | boost::bidirectional_traversal_tag, | |
350 | boundary_point<BaseIterator> const & | |
351 | > | |
352 | { | |
353 | public: | |
354 | typedef BaseIterator base_iterator; | |
355 | typedef mapping<base_iterator> mapping_type; | |
356 | typedef boundary_point<base_iterator> boundary_point_type; | |
357 | ||
358 | boundary_point_index_iterator() : current_(0),map_(0) | |
359 | { | |
360 | } | |
361 | ||
362 | boundary_point_index_iterator(bool is_begin,mapping_type const *map,rule_type mask) : | |
363 | map_(map), | |
364 | mask_(mask) | |
365 | { | |
366 | if(is_begin) | |
367 | set_begin(); | |
368 | else | |
369 | set_end(); | |
370 | } | |
371 | boundary_point_index_iterator(base_iterator p,mapping_type const *map,rule_type mask) : | |
372 | map_(map), | |
373 | mask_(mask) | |
374 | { | |
375 | set(p); | |
376 | } | |
377 | ||
378 | boundary_point_type const &dereference() const | |
379 | { | |
380 | return value_; | |
381 | } | |
382 | ||
383 | bool equal(boundary_point_index_iterator const &other) const | |
384 | { | |
385 | return map_ == other.map_ && current_ == other.current_; | |
386 | } | |
387 | ||
388 | void increment() | |
389 | { | |
390 | size_t next = current_; | |
391 | while(next < size()) { | |
392 | next++; | |
393 | if(valid_offset(next)) | |
394 | break; | |
395 | } | |
396 | update_current(next); | |
397 | } | |
398 | ||
399 | void decrement() | |
400 | { | |
401 | size_t next = current_; | |
402 | while(next>0) { | |
403 | next--; | |
404 | if(valid_offset(next)) | |
405 | break; | |
406 | } | |
407 | update_current(next); | |
408 | } | |
409 | ||
410 | private: | |
411 | void set_end() | |
412 | { | |
413 | current_ = size(); | |
414 | value_ = boundary_point_type(map_->end(),0); | |
415 | } | |
416 | void set_begin() | |
417 | { | |
418 | current_ = 0; | |
419 | value_ = boundary_point_type(map_->begin(),0); | |
420 | } | |
421 | ||
422 | void set(base_iterator p) | |
423 | { | |
424 | size_t dist = std::distance(map_->begin(),p); | |
425 | ||
426 | index_type::const_iterator b=index().begin(); | |
427 | index_type::const_iterator e=index().end(); | |
428 | index_type::const_iterator ptr = std::lower_bound(b,e,break_info(dist)); | |
429 | ||
430 | if(ptr==index().end()) | |
431 | current_=size()-1; | |
432 | else | |
433 | current_=ptr - index().begin(); | |
434 | ||
435 | while(!valid_offset(current_)) | |
436 | current_ ++; | |
437 | ||
438 | std::ptrdiff_t diff = get_offset(current_) - dist; | |
439 | std::advance(p,diff); | |
440 | value_.iterator(p); | |
441 | update_rule(); | |
442 | } | |
443 | ||
444 | void update_current(size_t pos) | |
445 | { | |
446 | std::ptrdiff_t diff = get_offset(pos) - get_offset(current_); | |
447 | base_iterator i=value_.iterator(); | |
448 | std::advance(i,diff); | |
449 | current_ = pos; | |
450 | value_.iterator(i); | |
451 | update_rule(); | |
452 | } | |
453 | ||
454 | void update_rule() | |
455 | { | |
456 | if(current_ != size()) { | |
457 | value_.rule(index()[current_].rule); | |
458 | } | |
459 | } | |
460 | size_t get_offset(size_t ind) const | |
461 | { | |
462 | if(ind == size()) | |
463 | return index().back().offset; | |
464 | return index()[ind].offset; | |
465 | } | |
466 | ||
467 | bool valid_offset(size_t offset) const | |
468 | { | |
469 | return offset == 0 | |
470 | || offset + 1 >= size() // last and first are always valid regardless of mark | |
471 | || (index()[offset].rule & mask_)!=0; | |
472 | } | |
473 | ||
474 | size_t size() const | |
475 | { | |
476 | return index().size(); | |
477 | } | |
478 | ||
479 | index_type const &index() const | |
480 | { | |
481 | return map_->index(); | |
482 | } | |
483 | ||
484 | ||
485 | boundary_point_type value_; | |
486 | size_t current_; | |
487 | mapping_type const *map_; | |
488 | rule_type mask_; | |
489 | }; | |
490 | ||
491 | ||
492 | } // details | |
493 | ||
494 | /// \endcond | |
495 | ||
496 | template<typename BaseIterator> | |
497 | class segment_index; | |
498 | ||
499 | template<typename BaseIterator> | |
500 | class boundary_point_index; | |
501 | ||
502 | ||
503 | /// | |
504 | /// \brief This class holds an index of segments in the text range and allows to iterate over them | |
505 | /// | |
506 | /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators | |
507 | /// to the \ref segment objects. | |
508 | /// | |
509 | /// It provides two options on way of selecting segments: | |
510 | /// | |
511 | /// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to | |
512 | /// various masks %as \ref word_any. | |
513 | /// \n | |
514 | /// The default is to select any types of boundaries. | |
515 | /// \n | |
516 | /// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators | |
517 | /// would iterate only over the words containing Kana letters and \ref word_any would select all types of | |
518 | /// words excluding ranges that consist of white space and punctuation marks. So iterating over the text | |
519 | /// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be", instead | |
520 | /// of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?". | |
521 | /// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous | |
522 | /// %boundary point does not fit the selected rule. | |
523 | /// \n | |
524 | /// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?". | |
525 | /// \n | |
526 | /// This text contains three %boundary points separating it to sentences by different rules: | |
527 | /// - The exclamation mark "!" ends the sentence "Hello!" | |
528 | /// - The line feed that splits the sentence "How\nare you?" into two parts. | |
529 | /// - The question mark that ends the second sentence. | |
530 | /// \n | |
531 | /// If you would only change the \ref rule() to \ref sentence_term then the segment_index would | |
532 | /// provide two sentences "Hello!" and "are you?" %as only they are actually terminated with the required | |
533 | /// terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include | |
534 | /// all the text up to previous valid %boundary point and would return two expected sentences: | |
535 | /// "Hello!" and "How\nare you?". | |
536 | /// | |
537 | /// This class allows to find a segment according to the given iterator in range using \ref find() member | |
538 | /// function. | |
539 | /// | |
540 | /// \note | |
541 | /// | |
542 | /// - Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text | |
543 | /// invalidates existing iterators and they can't be used any more. | |
544 | /// - segment_index can be created from boundary_point_index or other segment_index that was created with | |
545 | /// same \ref boundary_type. This is very fast operation %as they shared same index | |
546 | /// and it does not require its regeneration. | |
547 | /// | |
548 | /// \see | |
549 | /// | |
550 | /// - \ref boundary_point_index | |
551 | /// - \ref segment | |
552 | /// - \ref boundary_point | |
553 | /// | |
554 | ||
555 | template<typename BaseIterator> | |
556 | class segment_index { | |
557 | public: | |
558 | ||
559 | /// | |
560 | /// The type of the iterator used to iterate over the original text | |
561 | /// | |
562 | typedef BaseIterator base_iterator; | |
563 | #ifdef BOOST_LOCALE_DOXYGEN | |
564 | /// | |
565 | /// The bidirectional iterator that iterates over \ref value_type objects. | |
566 | /// | |
567 | /// - The iterators may be invalidated by use of any non-const member function | |
568 | /// including but not limited to \ref rule(rule_type) and \ref full_select(bool). | |
569 | /// - The returned value_type object is valid %as long %as iterator points to it. | |
570 | /// So this following code is wrong %as t used after p was updated: | |
571 | /// \code | |
572 | /// segment_index<some_iterator>::iterator p=index.begin(); | |
573 | /// segment<some_iterator> &t = *p; | |
574 | /// ++p; | |
575 | /// cout << t.str() << endl; | |
576 | /// \endcode | |
577 | /// | |
578 | typedef unspecified_iterator_type iterator; | |
579 | /// | |
580 | /// \copydoc iterator | |
581 | /// | |
582 | typedef unspecified_iterator_type const_iterator; | |
583 | #else | |
584 | typedef details::segment_index_iterator<base_iterator> iterator; | |
585 | typedef details::segment_index_iterator<base_iterator> const_iterator; | |
586 | #endif | |
587 | /// | |
588 | /// The type dereferenced by the \ref iterator and \ref const_iterator. It is | |
589 | /// an object that represents selected segment. | |
590 | /// | |
591 | typedef segment<base_iterator> value_type; | |
592 | ||
593 | /// | |
594 | /// Default constructor. | |
595 | /// | |
596 | /// \note | |
597 | /// | |
598 | /// When this object is constructed by default it does not include a valid index, thus | |
599 | /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined | |
600 | /// behavior | |
601 | /// | |
602 | segment_index() : mask_(0xFFFFFFFFu),full_select_(false) | |
603 | { | |
604 | } | |
605 | /// | |
606 | /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text | |
607 | /// in range [begin,end) using a rule \a mask for locale \a loc. | |
608 | /// | |
609 | segment_index(boundary_type type, | |
610 | base_iterator begin, | |
611 | base_iterator end, | |
612 | rule_type mask, | |
613 | std::locale const &loc=std::locale()) | |
614 | : | |
615 | map_(type,begin,end,loc), | |
616 | mask_(mask), | |
617 | full_select_(false) | |
618 | { | |
619 | } | |
620 | /// | |
621 | /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text | |
622 | /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc. | |
623 | /// | |
624 | segment_index(boundary_type type, | |
625 | base_iterator begin, | |
626 | base_iterator end, | |
627 | std::locale const &loc=std::locale()) | |
628 | : | |
629 | map_(type,begin,end,loc), | |
630 | mask_(0xFFFFFFFFu), | |
631 | full_select_(false) | |
632 | { | |
633 | } | |
634 | ||
635 | /// | |
636 | /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information | |
637 | /// and used default rule (all possible segments) | |
638 | /// | |
639 | /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text | |
640 | /// range it is much better to create one from another rather then indexing the same | |
641 | /// range twice. | |
642 | /// | |
643 | /// \note \ref rule() flags are not copied | |
644 | /// | |
645 | segment_index(boundary_point_index<base_iterator> const &); | |
646 | /// | |
647 | /// Copy an index from a \ref boundary_point_index. It copies all indexing information | |
648 | /// and uses the default rule (all possible segments) | |
649 | /// | |
650 | /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text | |
651 | /// range it is much better to create one from another rather then indexing the same | |
652 | /// range twice. | |
653 | /// | |
654 | /// \note \ref rule() flags are not copied | |
655 | /// | |
656 | segment_index const &operator = (boundary_point_index<base_iterator> const &); | |
657 | ||
658 | ||
659 | /// | |
660 | /// Create a new index for %boundary analysis \ref boundary_type "type" of the text | |
661 | /// in range [begin,end) for locale \a loc. | |
662 | /// | |
663 | /// \note \ref rule() and \ref full_select() remain unchanged. | |
664 | /// | |
665 | void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) | |
666 | { | |
667 | map_ = mapping_type(type,begin,end,loc); | |
668 | } | |
669 | ||
670 | /// | |
671 | /// Get the \ref iterator on the beginning of the segments range. | |
672 | /// | |
673 | /// Preconditions: the segment_index should have a mapping | |
674 | /// | |
675 | /// \note | |
676 | /// | |
677 | /// The returned iterator is invalidated by access to any non-const member functions of this object | |
678 | /// | |
679 | iterator begin() const | |
680 | { | |
681 | return iterator(true,&map_,mask_,full_select_); | |
682 | } | |
683 | ||
684 | /// | |
685 | /// Get the \ref iterator on the ending of the segments range. | |
686 | /// | |
687 | /// Preconditions: the segment_index should have a mapping | |
688 | /// | |
689 | /// The returned iterator is invalidated by access to any non-const member functions of this object | |
690 | /// | |
691 | iterator end() const | |
692 | { | |
693 | return iterator(false,&map_,mask_,full_select_); | |
694 | } | |
695 | ||
696 | /// | |
697 | /// Find a first valid segment following a position \a p. | |
698 | /// | |
699 | /// If \a p is inside a valid segment this segment is selected: | |
700 | /// | |
701 | /// For example: For \ref word %boundary analysis with \ref word_any rule(): | |
702 | /// | |
703 | /// - "to| be or ", would point to "be", | |
704 | /// - "t|o be or ", would point to "to", | |
705 | /// - "to be or| ", would point to end. | |
706 | /// | |
707 | /// | |
708 | /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator | |
709 | /// to the text in the mapped range. | |
710 | /// | |
711 | /// The returned iterator is invalidated by access to any non-const member functions of this object | |
712 | /// | |
713 | iterator find(base_iterator p) const | |
714 | { | |
715 | return iterator(p,&map_,mask_,full_select_); | |
716 | } | |
717 | ||
718 | /// | |
719 | /// Get the mask of rules that are used | |
720 | /// | |
721 | rule_type rule() const | |
722 | { | |
723 | return mask_; | |
724 | } | |
725 | /// | |
726 | /// Set the mask of rules that are used | |
727 | /// | |
728 | void rule(rule_type v) | |
729 | { | |
730 | mask_ = v; | |
731 | } | |
732 | ||
733 | /// | |
734 | /// Get the full_select property value - should segment include in the range | |
735 | /// values that not belong to specific \ref rule() or not. | |
736 | /// | |
737 | /// The default value is false. | |
738 | /// | |
739 | /// For example for \ref sentence %boundary with rule \ref sentence_term the segments | |
740 | /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false | |
741 | /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() | |
742 | /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the | |
743 | /// following part "are you?" | |
744 | /// | |
745 | ||
746 | bool full_select() const | |
747 | { | |
748 | return full_select_; | |
749 | } | |
750 | ||
751 | /// | |
752 | /// Set the full_select property value - should segment include in the range | |
753 | /// values that not belong to specific \ref rule() or not. | |
754 | /// | |
755 | /// The default value is false. | |
756 | /// | |
757 | /// For example for \ref sentence %boundary with rule \ref sentence_term the segments | |
758 | /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false | |
759 | /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() | |
760 | /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the | |
761 | /// following part "are you?" | |
762 | /// | |
763 | ||
764 | void full_select(bool v) | |
765 | { | |
766 | full_select_ = v; | |
767 | } | |
768 | ||
769 | private: | |
770 | friend class boundary_point_index<base_iterator>; | |
771 | typedef details::mapping<base_iterator> mapping_type; | |
772 | mapping_type map_; | |
773 | rule_type mask_; | |
774 | bool full_select_; | |
775 | }; | |
776 | ||
777 | /// | |
778 | /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating | |
779 | /// over them. | |
780 | /// | |
781 | /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators | |
782 | /// to the \ref boundary_point objects. | |
783 | /// | |
784 | /// It provides an option that affects selecting %boundary points according to different rules: | |
785 | /// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific | |
786 | /// types of %boundary points like \ref sentence_term. | |
787 | /// | |
788 | /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default | |
789 | /// rule is used the %boundary points would be: | |
790 | /// | |
791 | /// - "|Hello! How\nare you?" | |
792 | /// - "Hello! |How\nare you?" | |
793 | /// - "Hello! How\n|are you?" | |
794 | /// - "Hello! How\nare you?|" | |
795 | /// | |
796 | /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be: | |
797 | /// | |
798 | /// - "|Hello! How\nare you?" | |
799 | /// - "Hello! |How\nare you?" | |
800 | /// - "Hello! How\nare you?|" | |
801 | /// | |
802 | /// Such that a %boundary point defined by a line feed character would be ignored. | |
803 | /// | |
804 | /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member | |
805 | /// function. | |
806 | /// | |
807 | /// \note | |
808 | /// - Even an empty text range [x,x) considered to have a one %boundary point x. | |
809 | /// - \a a and \a b points of the range [a,b) are always considered %boundary points | |
810 | /// regardless the rules used. | |
811 | /// - Changing any of the option \ref rule() or course re-indexing the text | |
812 | /// invalidates existing iterators and they can't be used any more. | |
813 | /// - boundary_point_index can be created from segment_index or other boundary_point_index that was created with | |
814 | /// same \ref boundary_type. This is very fast operation %as they shared same index | |
815 | /// and it does not require its regeneration. | |
816 | /// | |
817 | /// \see | |
818 | /// | |
819 | /// - \ref segment_index | |
820 | /// - \ref boundary_point | |
821 | /// - \ref segment | |
822 | /// | |
823 | ||
824 | ||
825 | template<typename BaseIterator> | |
826 | class boundary_point_index { | |
827 | public: | |
828 | /// | |
829 | /// The type of the iterator used to iterate over the original text | |
830 | /// | |
831 | typedef BaseIterator base_iterator; | |
832 | #ifdef BOOST_LOCALE_DOXYGEN | |
833 | /// | |
834 | /// The bidirectional iterator that iterates over \ref value_type objects. | |
835 | /// | |
836 | /// - The iterators may be invalidated by use of any non-const member function | |
837 | /// including but not limited to \ref rule(rule_type) member function. | |
838 | /// - The returned value_type object is valid %as long %as iterator points to it. | |
839 | /// So this following code is wrong %as t used after p was updated: | |
840 | /// \code | |
841 | /// boundary_point_index<some_iterator>::iterator p=index.begin(); | |
842 | /// boundary_point<some_iterator> &t = *p; | |
843 | /// ++p; | |
844 | /// rule_type r = t.rule(); | |
845 | /// \endcode | |
846 | /// | |
847 | typedef unspecified_iterator_type iterator; | |
848 | /// | |
849 | /// \copydoc iterator | |
850 | /// | |
851 | typedef unspecified_iterator_type const_iterator; | |
852 | #else | |
853 | typedef details::boundary_point_index_iterator<base_iterator> iterator; | |
854 | typedef details::boundary_point_index_iterator<base_iterator> const_iterator; | |
855 | #endif | |
856 | /// | |
857 | /// The type dereferenced by the \ref iterator and \ref const_iterator. It is | |
858 | /// an object that represents the selected \ref boundary_point "boundary point". | |
859 | /// | |
860 | typedef boundary_point<base_iterator> value_type; | |
861 | ||
862 | /// | |
863 | /// Default constructor. | |
864 | /// | |
865 | /// \note | |
866 | /// | |
867 | /// When this object is constructed by default it does not include a valid index, thus | |
868 | /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined | |
869 | /// behavior | |
870 | /// | |
871 | boundary_point_index() : mask_(0xFFFFFFFFu) | |
872 | { | |
873 | } | |
874 | ||
875 | /// | |
876 | /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text | |
877 | /// in range [begin,end) using a rule \a mask for locale \a loc. | |
878 | /// | |
879 | boundary_point_index(boundary_type type, | |
880 | base_iterator begin, | |
881 | base_iterator end, | |
882 | rule_type mask, | |
883 | std::locale const &loc=std::locale()) | |
884 | : | |
885 | map_(type,begin,end,loc), | |
886 | mask_(mask) | |
887 | { | |
888 | } | |
889 | /// | |
890 | /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text | |
891 | /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc. | |
892 | /// | |
893 | boundary_point_index(boundary_type type, | |
894 | base_iterator begin, | |
895 | base_iterator end, | |
896 | std::locale const &loc=std::locale()) | |
897 | : | |
898 | map_(type,begin,end,loc), | |
899 | mask_(0xFFFFFFFFu) | |
900 | { | |
901 | } | |
902 | ||
903 | /// | |
904 | /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information | |
905 | /// and uses the default rule (all possible %boundary points) | |
906 | /// | |
907 | /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text | |
908 | /// range it is much better to create one from another rather then indexing the same | |
909 | /// range twice. | |
910 | /// | |
911 | /// \note \ref rule() flags are not copied | |
912 | /// | |
913 | boundary_point_index(segment_index<base_iterator> const &other); | |
914 | /// | |
915 | /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information | |
916 | /// and keeps the current \ref rule() unchanged | |
917 | /// | |
918 | /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text | |
919 | /// range it is much better to create one from another rather then indexing the same | |
920 | /// range twice. | |
921 | /// | |
922 | /// \note \ref rule() flags are not copied | |
923 | /// | |
924 | boundary_point_index const &operator=(segment_index<base_iterator> const &other); | |
925 | ||
926 | /// | |
927 | /// Create a new index for %boundary analysis \ref boundary_type "type" of the text | |
928 | /// in range [begin,end) for locale \a loc. | |
929 | /// | |
930 | /// \note \ref rule() remains unchanged. | |
931 | /// | |
932 | void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) | |
933 | { | |
934 | map_ = mapping_type(type,begin,end,loc); | |
935 | } | |
936 | ||
937 | /// | |
938 | /// Get the \ref iterator on the beginning of the %boundary points range. | |
939 | /// | |
940 | /// Preconditions: this boundary_point_index should have a mapping | |
941 | /// | |
942 | /// \note | |
943 | /// | |
944 | /// The returned iterator is invalidated by access to any non-const member functions of this object | |
945 | /// | |
946 | iterator begin() const | |
947 | { | |
948 | return iterator(true,&map_,mask_); | |
949 | } | |
950 | ||
951 | /// | |
952 | /// Get the \ref iterator on the ending of the %boundary points range. | |
953 | /// | |
954 | /// Preconditions: this boundary_point_index should have a mapping | |
955 | /// | |
956 | /// \note | |
957 | /// | |
958 | /// The returned iterator is invalidated by access to any non-const member functions of this object | |
959 | /// | |
960 | iterator end() const | |
961 | { | |
962 | return iterator(false,&map_,mask_); | |
963 | } | |
964 | ||
965 | /// | |
966 | /// Find a first valid %boundary point on a position \a p or following it. | |
967 | /// | |
968 | /// For example: For \ref word %boundary analysis of the text "to be or" | |
969 | /// | |
970 | /// - "|to be", would return %boundary point at "|to be", | |
971 | /// - "t|o be", would point to "to| be" | |
972 | /// | |
973 | /// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator | |
974 | /// to the text in the mapped range. | |
975 | /// | |
976 | /// The returned iterator is invalidated by access to any non-const member functions of this object | |
977 | /// | |
978 | iterator find(base_iterator p) const | |
979 | { | |
980 | return iterator(p,&map_,mask_); | |
981 | } | |
982 | ||
983 | /// | |
984 | /// Get the mask of rules that are used | |
985 | /// | |
986 | rule_type rule() const | |
987 | { | |
988 | return mask_; | |
989 | } | |
990 | /// | |
991 | /// Set the mask of rules that are used | |
992 | /// | |
993 | void rule(rule_type v) | |
994 | { | |
995 | mask_ = v; | |
996 | } | |
997 | ||
998 | private: | |
999 | ||
1000 | friend class segment_index<base_iterator>; | |
1001 | typedef details::mapping<base_iterator> mapping_type; | |
1002 | mapping_type map_; | |
1003 | rule_type mask_; | |
1004 | }; | |
1005 | ||
1006 | /// \cond INTERNAL | |
1007 | template<typename BaseIterator> | |
1008 | segment_index<BaseIterator>::segment_index(boundary_point_index<BaseIterator> const &other) : | |
1009 | map_(other.map_), | |
1010 | mask_(0xFFFFFFFFu), | |
1011 | full_select_(false) | |
1012 | { | |
1013 | } | |
1014 | ||
1015 | template<typename BaseIterator> | |
1016 | boundary_point_index<BaseIterator>::boundary_point_index(segment_index<BaseIterator> const &other) : | |
1017 | map_(other.map_), | |
1018 | mask_(0xFFFFFFFFu) | |
1019 | { | |
1020 | } | |
1021 | ||
1022 | template<typename BaseIterator> | |
1023 | segment_index<BaseIterator> const &segment_index<BaseIterator>::operator=(boundary_point_index<BaseIterator> const &other) | |
1024 | { | |
1025 | map_ = other.map_; | |
1026 | return *this; | |
1027 | } | |
1028 | ||
1029 | template<typename BaseIterator> | |
1030 | boundary_point_index<BaseIterator> const &boundary_point_index<BaseIterator>::operator=(segment_index<BaseIterator> const &other) | |
1031 | { | |
1032 | map_ = other.map_; | |
1033 | return *this; | |
1034 | } | |
1035 | /// \endcond | |
1036 | ||
1037 | typedef segment_index<std::string::const_iterator> ssegment_index; ///< convenience typedef | |
1038 | typedef segment_index<std::wstring::const_iterator> wssegment_index; ///< convenience typedef | |
1039 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T | |
1040 | typedef segment_index<std::u16string::const_iterator> u16ssegment_index;///< convenience typedef | |
1041 | #endif | |
1042 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T | |
1043 | typedef segment_index<std::u32string::const_iterator> u32ssegment_index;///< convenience typedef | |
1044 | #endif | |
1045 | ||
1046 | typedef segment_index<char const *> csegment_index; ///< convenience typedef | |
1047 | typedef segment_index<wchar_t const *> wcsegment_index; ///< convenience typedef | |
1048 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T | |
1049 | typedef segment_index<char16_t const *> u16csegment_index; ///< convenience typedef | |
1050 | #endif | |
1051 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T | |
1052 | typedef segment_index<char32_t const *> u32csegment_index; ///< convenience typedef | |
1053 | #endif | |
1054 | ||
1055 | typedef boundary_point_index<std::string::const_iterator> sboundary_point_index;///< convenience typedef | |
1056 | typedef boundary_point_index<std::wstring::const_iterator> wsboundary_point_index;///< convenience typedef | |
1057 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T | |
1058 | typedef boundary_point_index<std::u16string::const_iterator> u16sboundary_point_index;///< convenience typedef | |
1059 | #endif | |
1060 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T | |
1061 | typedef boundary_point_index<std::u32string::const_iterator> u32sboundary_point_index;///< convenience typedef | |
1062 | #endif | |
1063 | ||
1064 | typedef boundary_point_index<char const *> cboundary_point_index; ///< convenience typedef | |
1065 | typedef boundary_point_index<wchar_t const *> wcboundary_point_index; ///< convenience typedef | |
1066 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T | |
1067 | typedef boundary_point_index<char16_t const *> u16cboundary_point_index;///< convenience typedef | |
1068 | #endif | |
1069 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T | |
1070 | typedef boundary_point_index<char32_t const *> u32cboundary_point_index;///< convenience typedef | |
1071 | #endif | |
1072 | ||
1073 | ||
1074 | ||
1075 | } // boundary | |
1076 | ||
1077 | } // locale | |
1078 | } // boost | |
1079 | ||
1080 | /// | |
1081 | /// \example boundary.cpp | |
1082 | /// Example of using segment_index | |
1083 | /// \example wboundary.cpp | |
1084 | /// Example of using segment_index over wide strings | |
1085 | /// | |
1086 | ||
1087 | #ifdef BOOST_MSVC | |
1088 | #pragma warning(pop) | |
1089 | #endif | |
1090 | ||
1091 | #endif | |
1092 | // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 |