2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
4 // Distributed under the Boost Software License, Version 1.0. (See
5 // accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
8 #ifndef BOOST_LOCALE_BOUNDARY_TYPES_HPP_INCLUDED
9 #define BOOST_LOCALE_BOUNDARY_TYPES_HPP_INCLUDED
11 #include <boost/locale/config.hpp>
12 #include <boost/cstdint.hpp>
13 #include <boost/assert.hpp>
15 # pragma warning(push)
16 # pragma warning(disable : 4275 4251 4231 4660)
25 /// \brief This namespace contains all operations required for boundary analysis of text
29 /// \defgroup boundary Boundary Analysis
31 /// This module contains all operations required for boundary analysis of text: character, word, line and sentence boundaries
37 /// This type describes the possible boundary analysis alternatives.
// Enumerators selecting which kind of boundary analysis to perform.
// NOTE(review): the enum header and closing brace are not visible in this excerpt.
40 character, ///< Analyse the text for character boundaries
41 word, ///< Analyse the text for word boundaries
42 sentence, ///< Analyse the text for sentence boundaries
43 line ///< Analyse the text for positions suitable for line breaks
47 /// \brief Flags used with word boundary analysis -- the type of the word, line or sentence boundary found.
49 /// It is a bit-mask that represents various combinations of rules used to select this specific boundary.
// 32-bit unsigned bit-mask type; the word_*, line_*, sentence_* and character_*
// constants declared below are all values of this type.
51 typedef uint32_t rule_type;
54 /// \anchor bl_boundary_word_rules
55 /// \name Flags that describe a type of word selected
// Word-boundary rule flags. Each basic category (none, number, letter, kana,
// ideo) occupies its own 4-bit nibble; the combined values that follow are the
// bitwise unions of the category nibbles they cover.
57 static const rule_type
58 word_none = 0x0000F, ///< Not a word, like white space or punctuation mark
59 word_number = 0x000F0, ///< Word that appears to be a number
60 word_letter = 0x00F00, ///< Word that contains letters, excluding kana and ideographic characters
61 word_kana = 0x0F000, ///< Word that contains kana characters
62 word_ideo = 0xF0000, ///< Word that contains ideographic characters
63 word_any = 0xFFFF0, ///< Any word including numbers; 0 is a special flag, equivalent to 15
64 word_letters = 0xFFF00, ///< Any word, excluding numbers but including letters, kana and ideograms.
65 word_kana_ideo = 0xFF000, ///< Word that includes kana or ideographic characters
66 word_mask = 0xFFFFF; ///< Full word mask - select all possible variants
70 /// \anchor bl_boundary_line_rules
71 /// \name Flags that describe a type of line break
// Line-break rule flags. Soft and hard breaks each take one 4-bit nibble;
// line_any and line_mask are both the union of the two (0xFF).
73 static const rule_type
74 line_soft = 0x0F, ///< Soft line break: optional but not required
75 line_hard = 0xF0, ///< Hard line break: line break is required (as per CR/LF)
76 line_any = 0xFF, ///< Soft or Hard line break
77 line_mask = 0xFF; ///< Select all types of line breaks
82 /// \anchor bl_boundary_sentence_rules
83 /// \name Flags that describe a type of sentence break
// Sentence-break rule flags. The two break kinds each take one 4-bit nibble;
// sentence_any and sentence_mask are both the union of the two (0xFF).
86 static const rule_type
87 sentence_term = 0x0F, ///< \brief The sentence was terminated with a sentence terminator
88 /// like ".", "!" possibly followed by a hard separator like CR, LF, PS
89 sentence_sep = 0xF0, ///< \brief The sentence does not contain a terminator like ".", "!" but ended with a hard separator
90 /// like CR, LF, PS or end of input.
91 sentence_any = 0xFF, ///< Either of the two sentence break types (sentence_term or sentence_sep).
92 sentence_mask = 0xFF; ///< Select all sentence breaking points
97 /// \name Flags that describe a type of character break.
99 /// At this point break iterator does not distinguish different
100 /// kinds of characters so it is used for consistency.
// Character-break rule flags. Only one nibble is defined, so character_any and
// character_mask share the same value (0xF).
102 static const rule_type
103 character_any = 0xF, ///< Not in use, just for consistency
104 character_mask = 0xF; ///< Select all character breaking points
109 /// This function returns the mask that covers all variants for specific boundary type
// Maps each boundary_type enumerator to the matching *_mask constant:
// character -> character_mask, word -> word_mask, sentence -> sentence_mask,
// line -> line_mask.
// NOTE(review): the switch header, the braces and any default branch are not
// visible in this excerpt - confirm what is returned for a value outside these
// four cases.
111 inline rule_type boundary_rule(boundary_type t)
114 case character: return character_mask;
115 case word: return word_mask;
116 case sentence: return sentence_mask;
117 case line: return line_mask;
136 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4