vendor/textwrap/src/word_separators.rs

   1 //! Functionality for finding words.
   2 //!
   3 //! In order to wrap text, we need to know where the legal break
   4 //! points are, i.e., where the words of the text are. This means that
   5 //! we need to define what a "word" is.
   6 //!
   7 //! A simple approach is to simply split the text on whitespace, but
   8 //! this does not work for East-Asian languages such as Chinese or
   9 //! Japanese where there are no spaces between words. Breaking a long
  10 //! sequence of emojis is another example where line breaks might be
  11 //! wanted even if there is no whitespace to be found.
  12 //!
  13 //! The [`WordSeparator`] enum is responsible for determining where
  14 //! the words are in a line of text. Please refer to the enum and
  15 //! its variants for more information.
  16
  17 #[cfg(feature = "unicode-linebreak")]
  18 use crate::core::skip_ansi_escape_sequence;
  19 use crate::core::Word;
  20
  21 /// Describes where words occur in a line of text.
  22 ///
  23 /// The simplest approach is to say that words are separated by one or
  24 /// more ASCII spaces (`' '`). This works for Western languages
  25 /// without emojis. A more complex approach is to use the Unicode line
  26 /// breaking algorithm, which finds break points in non-ASCII text.
  27 ///
  28 /// The line breaks occur between words; please see
  29 /// [`WordSplitter`](crate::WordSplitter) for options of how to handle
  30 /// hyphenation of individual words.
  31 ///
  32 /// # Examples
  33 ///
  34 /// ```
  35 /// use textwrap::core::Word;
  36 /// use textwrap::WordSeparator::AsciiSpace;
  37 ///
  38 /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
  39 /// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
  40 /// ```
  41 #[derive(Clone, Copy)]
  42 pub enum WordSeparator {
  43     /// Find words by splitting on runs of `' '` characters.
  44     ///
  45     /// # Examples
  46     ///
  47     /// ```
  48     /// use textwrap::core::Word;
  49     /// use textwrap::WordSeparator::AsciiSpace;
  50     ///
  51     /// let words = AsciiSpace.find_words("Hello   World!").collect::<Vec<_>>();
  52     /// assert_eq!(words, vec![Word::from("Hello   "),
  53     ///                        Word::from("World!")]);
  54     /// ```
  55     AsciiSpace,
  56
  57     /// Split `line` into words using Unicode break properties.
  58     ///
  59     /// This word separator uses the Unicode line breaking algorithm
  60     /// described in [Unicode Standard Annex
  61     /// #14](https://www.unicode.org/reports/tr14/) to find legal places
  62     /// to break lines. There is a small difference in that the U+002D
  63     /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
  64     /// to allow a line break at a hyphen, use
  65     /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
  66     /// Soft hyphens are not currently supported.
  67     ///
  68     /// # Examples
  69     ///
  70     /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
  71     /// breaking algorithm will find line break opportunities between
  72     /// some characters with no intervening whitespace:
  73     ///
  74     /// ```
  75     /// #[cfg(feature = "unicode-linebreak")] {
  76     /// use textwrap::core::Word;
  77     /// use textwrap::WordSeparator::UnicodeBreakProperties;
  78     ///
  79     /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(),
  80     ///            vec![Word::from("Emojis: "),
  81     ///                 Word::from("😂"),
  82     ///                 Word::from("😍")]);
  83     ///
  84     /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
  85     ///            vec![Word::from("CJK: "),
  86     ///                 Word::from("你"),
  87     ///                 Word::from("好")]);
  88     /// }
  89     /// ```
  90     ///
  91     /// A U+2060 (Word Joiner) character can be inserted if you want to
  92     /// manually override the defaults and keep the characters together:
  93     ///
  94     /// ```
  95     /// #[cfg(feature = "unicode-linebreak")] {
  96     /// use textwrap::core::Word;
  97     /// use textwrap::WordSeparator::UnicodeBreakProperties;
  98     ///
  99     /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(),
 100     ///            vec![Word::from("Emojis: "),
 101     ///                 Word::from("😂\u{2060}😍")]);
 102     /// }
 103     /// ```
 104     ///
 105     /// The Unicode line breaking algorithm will also automatically
 106     /// suppress line breaks around certain punctuation characters:
 107     ///
 108     /// ```
 109     /// #[cfg(feature = "unicode-linebreak")] {
 110     /// use textwrap::core::Word;
 111     /// use textwrap::WordSeparator::UnicodeBreakProperties;
 112     ///
 113     /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
 114     ///            vec![Word::from("[ foo ] "),
 115     ///                 Word::from("bar !")]);
 116     /// }
 117     /// ```
 118     #[cfg(feature = "unicode-linebreak")]
 119     UnicodeBreakProperties,
 120
 121     /// Find words using a custom word separator function.
 122     Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
 123 }
 124
 125 impl PartialEq for WordSeparator {
 126     /// Compare two word separators.
 127     ///
 128     /// ```
 129     /// use textwrap::WordSeparator;
 130     ///
 131     /// assert_eq!(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace);
 132     /// #[cfg(feature = "unicode-linebreak")] {
 133     ///     assert_eq!(WordSeparator::UnicodeBreakProperties,
 134     ///                WordSeparator::UnicodeBreakProperties);
 135     /// }
 136     /// ```
 137     ///
 138     /// Note that `WordSeparator::Custom` values never compare equal:
 139     ///
 140     /// ```
 141     /// use textwrap::WordSeparator;
 142     /// use textwrap::core::Word;
 143     /// fn word_separator(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_> {
 144     ///     Box::new(line.split_inclusive(' ').map(Word::from))
 145     /// }
 146     /// assert_ne!(WordSeparator::Custom(word_separator),
 147     ///            WordSeparator::Custom(word_separator));
 148     /// ```
 149     fn eq(&self, other: &Self) -> bool {
 150         match (self, other) {
 151             (WordSeparator::AsciiSpace, WordSeparator::AsciiSpace) => true,
 152             #[cfg(feature = "unicode-linebreak")]
 153             (WordSeparator::UnicodeBreakProperties, WordSeparator::UnicodeBreakProperties) => true,
 154             (_, _) => false,
 155         }
 156     }
 157 }
 158
 159 impl std::fmt::Debug for WordSeparator {
 160     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 161         match self {
 162             WordSeparator::AsciiSpace => f.write_str("AsciiSpace"),
 163             #[cfg(feature = "unicode-linebreak")]
 164             WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"),
 165             WordSeparator::Custom(_) => f.write_str("Custom(...)"),
 166         }
 167     }
 168 }
 169
 170 impl WordSeparator {
 171     /// Create a new word separator.
 172     ///
 173     /// The best available algorithm is used by default, i.e.,
 174     /// [`WordSeparator::UnicodeBreakProperties`] if available,
 175     /// otherwise [`WordSeparator::AsciiSpace`].
 176     pub const fn new() -> Self {
 177         #[cfg(feature = "unicode-linebreak")]
 178         {
 179             WordSeparator::UnicodeBreakProperties
 180         }
 181
 182         #[cfg(not(feature = "unicode-linebreak"))]
 183         {
 184             WordSeparator::AsciiSpace
 185         }
 186     }
 187
 188     // This function should really return impl Iterator<Item = Word>, but
 189     // this isn't possible until Rust supports higher-kinded types:
 190     // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
 191     /// Find all words in `line`.
 192     pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
 193         match self {
 194             WordSeparator::AsciiSpace => find_words_ascii_space(line),
 195             #[cfg(feature = "unicode-linebreak")]
 196             WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
 197             WordSeparator::Custom(func) => func(line),
 198         }
 199     }
 200 }
 201
 202 fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
 203     let mut start = 0;
 204     let mut in_whitespace = false;
 205     let mut char_indices = line.char_indices();
 206
 207     Box::new(std::iter::from_fn(move || {
 208         for (idx, ch) in char_indices.by_ref() {
 209             if in_whitespace && ch != ' ' {
 210                 let word = Word::from(&line[start..idx]);
 211                 start = idx;
 212                 in_whitespace = ch == ' ';
 213                 return Some(word);
 214             }
 215
 216             in_whitespace = ch == ' ';
 217         }
 218
 219         if start < line.len() {
 220             let word = Word::from(&line[start..]);
 221             start = line.len();
 222             return Some(word);
 223         }
 224
 225         None
 226     }))
 227 }
 228
 229 // Strip all ANSI escape sequences from `text`.
 230 #[cfg(feature = "unicode-linebreak")]
 231 fn strip_ansi_escape_sequences(text: &str) -> String {
 232     let mut result = String::with_capacity(text.len());
 233
 234     let mut chars = text.chars();
 235     while let Some(ch) = chars.next() {
 236         if skip_ansi_escape_sequence(ch, &mut chars) {
 237             continue;
 238         }
 239         result.push(ch);
 240     }
 241
 242     result
 243 }
 244
 245 /// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
 246 /// if a line is broken at this point, and otherwise be invisible.
 247 /// Textwrap does not currently support breaking words at soft
 248 /// hyphens.
 249 #[cfg(feature = "unicode-linebreak")]
 250 const SHY: char = '\u{00ad}';
 251
 252 /// Find words in line. ANSI escape sequences are ignored in `line`.
 253 #[cfg(feature = "unicode-linebreak")]
 254 fn find_words_unicode_break_properties<'a>(
 255     line: &'a str,
 256 ) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
 257     // Construct an iterator over (original index, stripped index)
 258     // tuples. We find the Unicode linebreaks on a stripped string,
 259     // but we need the original indices so we can form words based on
 260     // the original string.
 261     let mut last_stripped_idx = 0;
 262     let mut char_indices = line.char_indices();
 263     let mut idx_map = std::iter::from_fn(move || match char_indices.next() {
 264         Some((orig_idx, ch)) => {
 265             let stripped_idx = last_stripped_idx;
 266             if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
 267                 last_stripped_idx += ch.len_utf8();
 268             }
 269             Some((orig_idx, stripped_idx))
 270         }
 271         None => None,
 272     });
 273
 274     let stripped = strip_ansi_escape_sequences(line);
 275     let mut opportunities = unicode_linebreak::linebreaks(&stripped)
 276         .filter(|(idx, _)| {
 277             #[allow(clippy::match_like_matches_macro)]
 278             match &stripped[..*idx].chars().next_back() {
 279                 // We suppress breaks at ‘-’ since we want to control
 280                 // this via the WordSplitter.
 281                 Some('-') => false,
 282                 // Soft hyphens are currently not supported since we
 283                 // require all `Word` fragments to be continuous in
 284                 // the input string.
 285                 Some(SHY) => false,
 286                 // Other breaks should be fine!
 287                 _ => true,
 288             }
 289         })
 290         .collect::<Vec<_>>()
 291         .into_iter();
 292
 293     // Remove final break opportunity, we will add it below using
 294     // &line[start..]; This ensures that we correctly include a
 295     // trailing ANSI escape sequence.
 296     opportunities.next_back();
 297
 298     let mut start = 0;
 299     Box::new(std::iter::from_fn(move || {
 300         for (idx, _) in opportunities.by_ref() {
 301             if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
 302                 let word = Word::from(&line[start..orig_idx]);
 303                 start = orig_idx;
 304                 return Some(word);
 305             }
 306         }
 307
 308         if start < line.len() {
 309             let word = Word::from(&line[start..]);
 310             start = line.len();
 311             return Some(word);
 312         }
 313
 314         None
 315     }))
 316 }
 317
 318 #[cfg(test)]
 319 mod tests {
 320     use super::WordSeparator::*;
 321     use super::*;
 322
 323     // Like assert_eq!, but the left expression is an iterator (it is collected into a Vec first).
 324     macro_rules! assert_iter_eq {
 325         ($left:expr, $right:expr) => {
 326             assert_eq!($left.collect::<Vec<_>>(), $right);
 327         };
 328     }
 329     // Convert a list of string slices into `Word`s via `Word::from`.
 330     fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
 331         words.into_iter().map(Word::from).collect()
 332     }
 333     // Generate two tests from `[line, ascii_words, unicode_words]` cases: one per built-in separator.
 334     macro_rules! test_find_words {
 335         ($ascii_name:ident,
 336          $unicode_name:ident,
 337          $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
 338             #[test]
 339             fn $ascii_name() {
 340                 $(
 341                     let expected_words = to_words($ascii_words.to_vec());
 342                     let actual_words = WordSeparator::AsciiSpace
 343                         .find_words($line)
 344                         .collect::<Vec<_>>();
 345                     assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
 346                 )+
 347             }
 348
 349             #[test]
 350             #[cfg(feature = "unicode-linebreak")]
 351             fn $unicode_name() {
 352                 $(
 353                     let expected_words = to_words($unicode_words.to_vec());
 354                     let actual_words = WordSeparator::UnicodeBreakProperties
 355                         .find_words($line)
 356                         .collect::<Vec<_>>();
 357                     assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
 358                 )+
 359             }
 360         };
 361     }
 362
 363     test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);
 364
 365     test_find_words!(
 366         ascii_single_word,
 367         unicode_single_word,
 368         ["foo", ["foo"], ["foo"]]
 369     );
 370
 371     test_find_words!(
 372         ascii_two_words,
 373         unicode_two_words,
 374         ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
 375     );
 376
 377     test_find_words!(
 378         ascii_multiple_words,
 379         unicode_multiple_words,
 380         ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
 381         ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
 382     );
 383     // A line consisting only of spaces is a single word.
 384     test_find_words!(
 385         ascii_only_whitespace,
 386         unicode_only_whitespace,
 387         [" ", [" "], [" "]],
 388         ["    ", ["    "], ["    "]]
 389     );
 390     // Spaces between words stay attached to the word they follow.
 391     test_find_words!(
 392         ascii_inter_word_whitespace,
 393         unicode_inter_word_whitespace,
 394         ["foo   bar", ["foo   ", "bar"], ["foo   ", "bar"]]
 395     );
 396
 397     test_find_words!(
 398         ascii_trailing_whitespace,
 399         unicode_trailing_whitespace,
 400         ["foo   ", ["foo   "], ["foo   "]]
 401     );
 402     // Leading spaces form a word of their own.
 403     test_find_words!(
 404         ascii_leading_whitespace,
 405         unicode_leading_whitespace,
 406         ["   foo", ["   ", "foo"], ["   ", "foo"]]
 407     );
 408
 409     test_find_words!(
 410         ascii_multi_column_char,
 411         unicode_multi_column_char,
 412         ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
 413     );
 414     // Neither separator breaks directly after a '-'; only the surrounding spaces separate words.
 415     test_find_words!(
 416         ascii_hyphens,
 417         unicode_hyphens,
 418         ["foo-bar", ["foo-bar"], ["foo-bar"]],
 419         ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
 420         ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
 421         ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
 422     );
 423     // AsciiSpace keeps a newline inside the word; UnicodeBreakProperties breaks after it.
 424     test_find_words!(
 425         ascii_newline,
 426         unicode_newline,
 427         ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
 428     );
 429     // AsciiSpace keeps a tab inside the word; UnicodeBreakProperties breaks after it.
 430     test_find_words!(
 431         ascii_tab,
 432         unicode_tab,
 433         ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
 434     );
 435     // Neither separator breaks at U+00A0.
 436     test_find_words!(
 437         ascii_non_breaking_space,
 438         unicode_non_breaking_space,
 439         ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
 440     );
 441     // Color escape sequences emitted by termion stay attached to the word they surround.
 442     #[test]
 443     #[cfg(unix)]
 444     fn find_words_colored_text() {
 445         use termion::color::{Blue, Fg, Green, Reset};
 446
 447         let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
 448         let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
 449         assert_iter_eq!(
 450             AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
 451             vec![Word::from(&green_hello), Word::from(&blue_world)]
 452         );
 453
 454         #[cfg(feature = "unicode-linebreak")]
 455         assert_iter_eq!(
 456             UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
 457             vec![Word::from(&green_hello), Word::from(&blue_world)]
 458         );
 459     }
 460     // Escape sequences in the middle of a word must not introduce a break.
 461     #[test]
 462     fn find_words_color_inside_word() {
 463         let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
 464         assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);
 465
 466         #[cfg(feature = "unicode-linebreak")]
 467         assert_iter_eq!(
 468             UnicodeBreakProperties.find_words(text),
 469             vec![Word::from(text)]
 470         );
 471     }
 472     // `new()` selects UnicodeBreakProperties when the feature is enabled, AsciiSpace otherwise.
 473     #[test]
 474     fn word_separator_new() {
 475         #[cfg(feature = "unicode-linebreak")]
 476         assert!(matches!(WordSeparator::new(), UnicodeBreakProperties));
 477
 478         #[cfg(not(feature = "unicode-linebreak"))]
 479         assert!(matches!(WordSeparator::new(), AsciiSpace));
 480     }
 481 }