]> git.proxmox.com Git - cargo.git/blob - vendor/textwrap/src/word_separators.rs
bump version to 0.66.0+pve1-1~bpo11+pve1
[cargo.git] / vendor / textwrap / src / word_separators.rs
1 //! Functionality for finding words.
2 //!
3 //! In order to wrap text, we need to know where the legal break
4 //! points are, i.e., where the words of the text are. This means that
5 //! we need to define what a "word" is.
6 //!
7 //! A simple approach is to simply split the text on whitespace, but
8 //! this does not work for East-Asian languages such as Chinese or
9 //! Japanese where there are no spaces between words. Breaking a long
10 //! sequence of emojis is another example where line breaks might be
11 //! wanted even if there is no whitespace to be found.
12 //!
13 //! The [`WordSeparator`] enum is responsible for determining where
14 //! the words are in a line of text. Please refer to the enum and
15 //! its variants for more information.
16
17 #[cfg(feature = "unicode-linebreak")]
18 use crate::core::skip_ansi_escape_sequence;
19 use crate::core::Word;
20
/// Describes where words occur in a line of text.
///
/// The simplest approach is to say that words are separated by one or
/// more ASCII spaces (`' '`). This works for Western languages
/// without emojis. A more complex approach is to use the Unicode line
/// breaking algorithm, which finds break points in non-ASCII text.
///
/// The line breaks occur between words, please see
/// [`WordSplitter`](crate::WordSplitter) for options of how to handle
/// hyphenation of individual words.
///
/// # Examples
///
/// ```
/// use textwrap::core::Word;
/// use textwrap::WordSeparator::AsciiSpace;
///
/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
/// ```
#[derive(Clone, Copy)]
pub enum WordSeparator {
    /// Find words by splitting on runs of `' '` characters.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::AsciiSpace;
    ///
    /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
    /// assert_eq!(words, vec![Word::from("Hello "),
    ///                        Word::from("World!")]);
    /// ```
    AsciiSpace,

    /// Split `line` into words using Unicode break properties.
    ///
    /// This word separator uses the Unicode line breaking algorithm
    /// described in [Unicode Standard Annex
    /// #14](https://www.unicode.org/reports/tr14/) to find legal places
    /// to break lines. There is a small difference in that the U+002D
    /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don't create a line break:
    /// to allow a line break at a hyphen, use
    /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
    /// Soft hyphens are not currently supported.
    ///
    /// # Examples
    ///
    /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
    /// breaking algorithm will find line break opportunities between
    /// some characters with no intervening whitespace:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(),
    ///            vec![Word::from("Emojis: "),
    ///                 Word::from("😂"),
    ///                 Word::from("😍")]);
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
    ///            vec![Word::from("CJK: "),
    ///                 Word::from("你"),
    ///                 Word::from("好")]);
    /// }
    /// ```
    ///
    /// A U+2060 (Word Joiner) character can be inserted if you want to
    /// manually override the defaults and keep the characters together:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(),
    ///            vec![Word::from("Emojis: "),
    ///                 Word::from("😂\u{2060}😍")]);
    /// }
    /// ```
    ///
    /// The Unicode line breaking algorithm will also automatically
    /// suppress line breaks around certain punctuation characters:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
    ///            vec![Word::from("[ foo ] "),
    ///                 Word::from("bar !")]);
    /// }
    /// ```
    #[cfg(feature = "unicode-linebreak")]
    UnicodeBreakProperties,

    /// Find words using a custom word separator.
    ///
    /// The wrapped function receives the line and returns a boxed
    /// iterator over the [`Word`]s it found in that line.
    Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
}
124
125 impl PartialEq for WordSeparator {
126 /// Compare two word separators.
127 ///
128 /// ```
129 /// use textwrap::WordSeparator;
130 ///
131 /// assert_eq!(WordSeparator::AsciiSpace, WordSeparator::AsciiSpace);
132 /// #[cfg(feature = "unicode-linebreak")] {
133 /// assert_eq!(WordSeparator::UnicodeBreakProperties,
134 /// WordSeparator::UnicodeBreakProperties);
135 /// }
136 /// ```
137 ///
138 /// Note that `WordSeparator::Custom` values never compare equal:
139 ///
140 /// ```
141 /// use textwrap::WordSeparator;
142 /// use textwrap::core::Word;
143 /// fn word_separator(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_> {
144 /// Box::new(line.split_inclusive(' ').map(Word::from))
145 /// }
146 /// assert_ne!(WordSeparator::Custom(word_separator),
147 /// WordSeparator::Custom(word_separator));
148 /// ```
149 fn eq(&self, other: &Self) -> bool {
150 match (self, other) {
151 (WordSeparator::AsciiSpace, WordSeparator::AsciiSpace) => true,
152 #[cfg(feature = "unicode-linebreak")]
153 (WordSeparator::UnicodeBreakProperties, WordSeparator::UnicodeBreakProperties) => true,
154 (_, _) => false,
155 }
156 }
157 }
158
159 impl std::fmt::Debug for WordSeparator {
160 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
161 match self {
162 WordSeparator::AsciiSpace => f.write_str("AsciiSpace"),
163 #[cfg(feature = "unicode-linebreak")]
164 WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"),
165 WordSeparator::Custom(_) => f.write_str("Custom(...)"),
166 }
167 }
168 }
169
170 impl WordSeparator {
171 /// Create a new word separator.
172 ///
173 /// The best available algorithm is used by default, i.e.,
174 /// [`WordSeparator::UnicodeBreakProperties`] if available,
175 /// otherwise [`WordSeparator::AsciiSpace`].
176 pub const fn new() -> Self {
177 #[cfg(feature = "unicode-linebreak")]
178 {
179 WordSeparator::UnicodeBreakProperties
180 }
181
182 #[cfg(not(feature = "unicode-linebreak"))]
183 {
184 WordSeparator::AsciiSpace
185 }
186 }
187
188 // This function should really return impl Iterator<Item = Word>, but
189 // this isn't possible until Rust supports higher-kinded types:
190 // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
191 /// Find all words in `line`.
192 pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
193 match self {
194 WordSeparator::AsciiSpace => find_words_ascii_space(line),
195 #[cfg(feature = "unicode-linebreak")]
196 WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
197 WordSeparator::Custom(func) => func(line),
198 }
199 }
200 }
201
202 fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
203 let mut start = 0;
204 let mut in_whitespace = false;
205 let mut char_indices = line.char_indices();
206
207 Box::new(std::iter::from_fn(move || {
208 for (idx, ch) in char_indices.by_ref() {
209 if in_whitespace && ch != ' ' {
210 let word = Word::from(&line[start..idx]);
211 start = idx;
212 in_whitespace = ch == ' ';
213 return Some(word);
214 }
215
216 in_whitespace = ch == ' ';
217 }
218
219 if start < line.len() {
220 let word = Word::from(&line[start..]);
221 start = line.len();
222 return Some(word);
223 }
224
225 None
226 }))
227 }
228
/// Return a copy of `text` without the characters that
/// `skip_ansi_escape_sequence` reports as part of an ANSI escape
/// sequence.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    // The stripped text is never longer than the input.
    let mut stripped = String::with_capacity(text.len());

    let mut remaining = text.chars();
    while let Some(ch) = remaining.next() {
        // The helper gets the iterator itself so it can advance past
        // the rest of a sequence that starts at `ch`.
        if !skip_ansi_escape_sequence(ch, &mut remaining) {
            stripped.push(ch);
        }
    }

    stripped
}
244
/// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
/// if a line is broken at this point, and otherwise be invisible.
/// Textwrap does not currently support breaking words at soft
/// hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';
251
/// Find words in line. ANSI escape sequences are ignored in `line`.
///
/// Break opportunities are computed with `unicode_linebreak::linebreaks`
/// on a copy of `line` with the escape sequences stripped. Each
/// opportunity is then mapped back to a byte offset in the original
/// `line`, so the returned [`Word`]s are slices of `line` itself and
/// still contain any escape sequences.
///
/// Break opportunities directly after `'-'` or a soft hyphen
/// ([`SHY`]) are discarded.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // Construct an iterator over (original index, stripped index)
    // tuples. We find the Unicode linebreaks on a stripped string,
    // but we need the original indices so we can form words based on
    // the original string.
    let mut last_stripped_idx = 0;
    let mut char_indices = line.char_indices();
    let mut idx_map = std::iter::from_fn(move || match char_indices.next() {
        Some((orig_idx, ch)) => {
            let stripped_idx = last_stripped_idx;
            // The stripped index only advances for characters that are
            // not reported as the start of an escape sequence. The
            // helper is handed the underlying iterator, presumably so
            // it can consume the remainder of the sequence.
            if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
                last_stripped_idx += ch.len_utf8();
            }
            Some((orig_idx, stripped_idx))
        }
        None => None,
    });

    let stripped = strip_ansi_escape_sequences(line);
    // The filtered break indices are collected eagerly, so the
    // iterator returned below holds plain indices rather than a
    // borrow of the local `stripped` string.
    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
        .filter(|(idx, _)| {
            // Look at the character immediately before the candidate
            // break position.
            #[allow(clippy::match_like_matches_macro)]
            match &stripped[..*idx].chars().next_back() {
                // We suppress breaks at ‘-’ since we want to control
                // this via the WordSplitter.
                Some('-') => false,
                // Soft hyphens are currently not supported since we
                // require all `Word` fragments to be continuous in
                // the input string.
                Some(SHY) => false,
                // Other breaks should be fine!
                _ => true,
            }
        })
        .collect::<Vec<_>>()
        .into_iter();

    // Remove final break opportunity, we will add it below using
    // &line[start..]; This ensures that we correctly include a
    // trailing ANSI escape sequence.
    opportunities.next_back();

    // Byte offset in `line` where the next word begins.
    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        for (idx, _) in opportunities.by_ref() {
            // `idx_map` is only ever advanced, never restarted: each
            // lookup continues from where the previous one stopped.
            if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
                let word = Word::from(&line[start..orig_idx]);
                start = orig_idx;
                return Some(word);
            }
        }

        // No break opportunities left: emit whatever remains of the
        // line as the final word, exactly once.
        if start < line.len() {
            let word = Word::from(&line[start..]);
            start = line.len();
            return Some(word);
        }

        None
    }))
}
317
/// Unit tests for the word separators.
#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // Convert a list of string slices into the `Word`s they describe.
    fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
        words.into_iter().map(Word::from).collect()
    }

    // Generate a pair of tests from a list of
    // `[line, expected ASCII words, expected Unicode words]` cases:
    // one test runs every case through `WordSeparator::AsciiSpace`,
    // the other (only with the `unicode-linebreak` feature) through
    // `WordSeparator::UnicodeBreakProperties`.
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    let expected_words = to_words($ascii_words.to_vec());
                    let actual_words = WordSeparator::AsciiSpace
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }

            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    let expected_words = to_words($unicode_words.to_vec());
                    let actual_words = WordSeparator::UnicodeBreakProperties
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }
        };
    }

    // An empty line contains no words.
    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    // The separating space stays attached to the preceding word.
    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    // NOTE(review): the two cases below are identical as written;
    // presumably the second one originally used a longer run of
    // spaces. Confirm against upstream.
    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        [" ", [" "], [" "]]
    );

    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo ", ["foo "], ["foo "]]
    );

    // Leading whitespace becomes a word of its own.
    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        [" foo", [" ", "foo"], [" ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
    );

    // Neither separator breaks directly after a hyphen; only the
    // spaces create word boundaries here.
    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    // Only the Unicode separator treats a newline as a break point.
    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    // Only the Unicode separator treats a tab as a break point.
    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    // U+00A0 (no-break space) never separates words.
    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    // Color escape sequences stay attached to the words they surround.
    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    // Escape sequences in the middle of a word must not split it.
    #[test]
    fn find_words_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(text),
            vec![Word::from(text)]
        );
    }

    // `WordSeparator::new` selects the variant matching the enabled
    // features.
    #[test]
    fn word_separator_new() {
        #[cfg(feature = "unicode-linebreak")]
        assert!(matches!(WordSeparator::new(), UnicodeBreakProperties));

        #[cfg(not(feature = "unicode-linebreak"))]
        assert!(matches!(WordSeparator::new(), AsciiSpace));
    }
}