//! Functionality for finding words.
//!
//! In order to wrap text, we need to know where the legal break
//! points are, i.e., where the words of the text are. This means that
//! we need to define what a "word" is.
//!
//! A simple approach is to split the text on whitespace, but this
//! does not work for East-Asian languages such as Chinese or Japanese
//! where there are no spaces between words. Breaking a long sequence
//! of emojis is another example where line breaks might be wanted
//! even if there is no whitespace to be found.
//!
//! The [`WordSeparator`] enum is responsible for determining where
//! the words are in a line of text. Please refer to the enum and its
//! variants for more information.
17 #[cfg(feature = "unicode-linebreak")]
18 use crate::core
::skip_ansi_escape_sequence
;
19 use crate::core
::Word
;
21 /// Describes where words occur in a line of text.
23 /// The simplest approach is say that words are separated by one or
24 /// more ASCII spaces (`' '`). This works for Western languages
25 /// without emojis. A more complex approach is to use the Unicode line
26 /// breaking algorithm, which finds break points in non-ASCII text.
28 /// The line breaks occur between words, please see
29 /// [`WordSplitter`](crate::WordSplitter) for options of how to handle
30 /// hyphenation of individual words.
35 /// use textwrap::core::Word;
36 /// use textwrap::WordSeparator::AsciiSpace;
38 /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
39 /// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
41 #[derive(Clone, Copy)]
42 pub enum WordSeparator
{
43 /// Find words by splitting on runs of `' '` characters.
48 /// use textwrap::core::Word;
49 /// use textwrap::WordSeparator::AsciiSpace;
51 /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
52 /// assert_eq!(words, vec![Word::from("Hello "),
53 /// Word::from("World!")]);
57 /// Split `line` into words using Unicode break properties.
59 /// This word separator uses the Unicode line breaking algorithm
60 /// described in [Unicode Standard Annex
61 /// #14](https://www.unicode.org/reports/tr14/) to find legal places
62 /// to break lines. There is a small difference in that the U+002D
63 /// (Hyphen-Minus) and U+00AD (Soft Hyphen) donβt create a line break:
64 /// to allow a line break at a hyphen, use
65 /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
66 /// Soft hyphens are not currently supported.
70 /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
71 /// breaking algorithm will find line break opportunities between
72 /// some characters with no intervening whitespace:
75 /// #[cfg(feature = "unicode-linebreak")] {
76 /// use textwrap::core::Word;
77 /// use textwrap::WordSeparator::UnicodeBreakProperties;
79 /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: ππ").collect::<Vec<_>>(),
80 /// vec![Word::from("Emojis: "),
81 /// Word::from("π"),
82 /// Word::from("π")]);
84 /// assert_eq!(UnicodeBreakProperties.find_words("CJK: δ½ ε₯½").collect::<Vec<_>>(),
85 /// vec![Word::from("CJK: "),
86 /// Word::from("δ½ "),
87 /// Word::from("ε₯½")]);
91 /// A U+2060 (Word Joiner) character can be inserted if you want to
92 /// manually override the defaults and keep the characters together:
95 /// #[cfg(feature = "unicode-linebreak")] {
96 /// use textwrap::core::Word;
97 /// use textwrap::WordSeparator::UnicodeBreakProperties;
99 /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: π\u{2060}π").collect::<Vec<_>>(),
100 /// vec![Word::from("Emojis: "),
101 /// Word::from("π\u{2060}π")]);
105 /// The Unicode line breaking algorithm will also automatically
106 /// suppress break breaks around certain punctuation characters::
109 /// #[cfg(feature = "unicode-linebreak")] {
110 /// use textwrap::core::Word;
111 /// use textwrap::WordSeparator::UnicodeBreakProperties;
113 /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
114 /// vec![Word::from("[ foo ] "),
115 /// Word::from("bar !")]);
118 #[cfg(feature = "unicode-linebreak")]
119 UnicodeBreakProperties
,
121 /// Find words using a custom word separator
122 Custom(fn(line
: &str) -> Box
<dyn Iterator
<Item
= Word
<'_
>> + '_
>),
125 impl PartialEq
for WordSeparator
{
126 /// Compare two word separators.
129 /// use textwrap::WordSeparator;
131 /// assert_eq!(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace);
132 /// #[cfg(feature = "unicode-linebreak")] {
133 /// assert_eq!(WordSeparator::UnicodeBreakProperties,
134 /// WordSeparator::UnicodeBreakProperties);
138 /// Note that `WordSeparator::Custom` values never compare equal:
141 /// use textwrap::WordSeparator;
142 /// use textwrap::core::Word;
143 /// fn word_separator(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_> {
144 /// Box::new(line.split_inclusive(' ').map(Word::from))
146 /// assert_ne!(WordSeparator::Custom(word_separator),
147 /// WordSeparator::Custom(word_separator));
149 fn eq(&self, other
: &Self) -> bool
{
150 match (self, other
) {
151 (WordSeparator
::AsciiSpace
, WordSeparator
::AsciiSpace
) => true,
152 #[cfg(feature = "unicode-linebreak")]
153 (WordSeparator
::UnicodeBreakProperties
, WordSeparator
::UnicodeBreakProperties
) => true,
159 impl std
::fmt
::Debug
for WordSeparator
{
160 fn fmt(&self, f
: &mut std
::fmt
::Formatter
<'_
>) -> std
::fmt
::Result
{
162 WordSeparator
::AsciiSpace
=> f
.write_str("AsciiSpace"),
163 #[cfg(feature = "unicode-linebreak")]
164 WordSeparator
::UnicodeBreakProperties
=> f
.write_str("UnicodeBreakProperties"),
165 WordSeparator
::Custom(_
) => f
.write_str("Custom(...)"),
171 /// Create a new word separator.
173 /// The best available algorithm is used by default, i.e.,
174 /// [`WordSeparator::UnicodeBreakProperties`] if available,
175 /// otherwise [`WordSeparator::AsciiSpace`].
176 pub const fn new() -> Self {
177 #[cfg(feature = "unicode-linebreak")]
179 WordSeparator
::UnicodeBreakProperties
182 #[cfg(not(feature = "unicode-linebreak"))]
184 WordSeparator
::AsciiSpace
188 // This function should really return impl Iterator<Item = Word>, but
189 // this isn't possible until Rust supports higher-kinded types:
190 // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
191 /// Find all words in `line`.
192 pub fn find_words
<'a
>(&self, line
: &'a
str) -> Box
<dyn Iterator
<Item
= Word
<'a
>> + 'a
> {
194 WordSeparator
::AsciiSpace
=> find_words_ascii_space(line
),
195 #[cfg(feature = "unicode-linebreak")]
196 WordSeparator
::UnicodeBreakProperties
=> find_words_unicode_break_properties(line
),
197 WordSeparator
::Custom(func
) => func(line
),
202 fn find_words_ascii_space
<'a
>(line
: &'a
str) -> Box
<dyn Iterator
<Item
= Word
<'a
>> + 'a
> {
204 let mut in_whitespace
= false;
205 let mut char_indices
= line
.char_indices();
207 Box
::new(std
::iter
::from_fn(move || {
208 for (idx
, ch
) in char_indices
.by_ref() {
209 if in_whitespace
&& ch
!= ' '
{
210 let word
= Word
::from(&line
[start
..idx
]);
212 in_whitespace
= ch
== ' '
;
216 in_whitespace
= ch
== ' '
;
219 if start
< line
.len() {
220 let word
= Word
::from(&line
[start
..]);
// Strip all ANSI escape sequences from `text`.
//
// Every character for which `skip_ansi_escape_sequence` returns
// `true` is dropped. The helper is also handed the character
// iterator, presumably so that it can consume the remainder of the
// escape sequence (NOTE(review): confirm against `crate::core`).
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    // The result can never be longer than the input.
    let mut result = String::with_capacity(text.len());

    let mut chars = text.chars();
    // A `for` loop cannot be used here: the iterator is lent mutably
    // to the helper inside the loop body.
    while let Some(ch) = chars.next() {
        if skip_ansi_escape_sequence(ch, &mut chars) {
            continue;
        }
        result.push(ch);
    }

    result
}
/// Soft hyphen, also known as a "shy hyphen". Should show up as '-'
/// if a line is broken at this point, and otherwise be invisible.
/// Textwrap does not currently support breaking words at soft
/// hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';
/// Find words in line. ANSI escape sequences are ignored in `line`.
///
/// Break opportunities are computed on a copy of `line` with the
/// escape sequences stripped, and then mapped back to byte offsets in
/// the original string so that the yielded words are slices of `line`
/// with the escape sequences intact.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // Construct an iterator over (original index, stripped index)
    // tuples. We find the Unicode linebreaks on a stripped string,
    // but we need the original indices so we can form words based on
    // the original string.
    let mut last_stripped_idx = 0;
    let mut char_indices = line.char_indices();
    let mut idx_map = std::iter::from_fn(move || match char_indices.next() {
        Some((orig_idx, ch)) => {
            let stripped_idx = last_stripped_idx;
            // Only characters that survive stripping advance the
            // stripped index.
            if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
                last_stripped_idx += ch.len_utf8();
            }
            Some((orig_idx, stripped_idx))
        }
        None => None,
    });

    let stripped = strip_ansi_escape_sequences(line);
    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
        .filter(|(idx, _)| {
            #[allow(clippy::match_like_matches_macro)]
            match &stripped[..*idx].chars().next_back() {
                // We suppress breaks at '-' since we want to control
                // this via the WordSplitter.
                Some('-') => false,
                // Soft hyphens are currently not supported since we
                // require all `Word` fragments to be continuous in
                // the input string.
                Some(SHY) => false,
                // Other breaks should be fine!
                _ => true,
            }
        })
        // Collected so that the returned iterator does not borrow the
        // local `stripped` string.
        .collect::<Vec<_>>()
        .into_iter();

    // Remove final break opportunity, we will add it below using
    // &line[start..]; This ensures that we correctly include a
    // trailing ANSI escape sequence.
    opportunities.next_back();

    // Byte offset in `line` where the next word begins.
    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        for (idx, _) in opportunities.by_ref() {
            // `idx_map` is only ever advanced, which works because the
            // break opportunities arrive in increasing order.
            if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
                let word = Word::from(&line[start..orig_idx]);
                start = orig_idx;
                return Some(word);
            }
        }

        // Emit the remainder (including any trailing escape
        // sequence) exactly once.
        if start < line.len() {
            let word = Word::from(&line[start..]);
            start = line.len();
            return Some(word);
        }

        None
    }))
}
#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // Convert string slices into the `Word`s expected from `find_words`.
    fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
        words.into_iter().map(Word::from).collect()
    }

    // Generate one test for `AsciiSpace` and one for
    // `UnicodeBreakProperties` from a list of
    // `[line, expected ASCII words, expected Unicode words]` cases.
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    let expected_words = to_words($ascii_words.to_vec());
                    let actual_words = WordSeparator::AsciiSpace
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }

            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    let expected_words = to_words($unicode_words.to_vec());
                    let actual_words = WordSeparator::UnicodeBreakProperties
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }
        };
    }

    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        ["    ", ["    "], ["    "]]
    );

    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo   bar", ["foo   ", "bar"], ["foo   ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo   ", ["foo   "], ["foo   "]]
    );

    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        ["   foo", ["   ", "foo"], ["   ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
    );

    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    #[test]
    fn find_words_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(text),
            vec![Word::from(text)]
        );
    }

    #[test]
    fn word_separator_new() {
        #[cfg(feature = "unicode-linebreak")]
        assert!(matches!(WordSeparator::new(), UnicodeBreakProperties));

        #[cfg(not(feature = "unicode-linebreak"))]
        assert!(matches!(WordSeparator::new(), AsciiSpace));
    }
}