]> git.proxmox.com Git - cargo.git/blob - vendor/textwrap/src/word_separators.rs
New upstream version 0.63.1
[cargo.git] / vendor / textwrap / src / word_separators.rs
1 //! Functionality for finding words.
2 //!
3 //! In order to wrap text, we need to know where the legal break
4 //! points are, i.e., where the words of the text are. This means that
5 //! we need to define what a "word" is.
6 //!
7 //! A simple approach is to simply split the text on whitespace, but
8 //! this does not work for East-Asian languages such as Chinese or
9 //! Japanese where there are no spaces between words. Breaking a long
10 //! sequence of emojis is another example where line breaks might be
11 //! wanted even if there is no whitespace to be found.
12 //!
13 //! The [`WordSeparator`] enum is responsible for determining where
14 //! the words are in a line of text. Please refer to the enum and
15 //! its variants for more information.
16
17 #[cfg(feature = "unicode-linebreak")]
18 use crate::core::skip_ansi_escape_sequence;
19 use crate::core::Word;
20
21 /// Describes where words occur in a line of text.
22 ///
23 /// The simplest approach is say that words are separated by one or
24 /// more ASCII spaces (`' '`). This works for Western languages
25 /// without emojis. A more complex approach is to use the Unicode line
26 /// breaking algorithm, which finds break points in non-ASCII text.
27 ///
28 /// The line breaks occur between words, please see
29 /// [`WordSplitter`](crate::WordSplitter) for options of how to handle
30 /// hyphenation of individual words.
31 ///
32 /// # Examples
33 ///
34 /// ```
35 /// use textwrap::core::Word;
36 /// use textwrap::WordSeparator::AsciiSpace;
37 ///
38 /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
39 /// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
40 /// ```
41 #[derive(Clone, Copy)]
42 pub enum WordSeparator {
43 /// Find words by splitting on runs of `' '` characters.
44 ///
45 /// # Examples
46 ///
47 /// ```
48 /// use textwrap::core::Word;
49 /// use textwrap::WordSeparator::AsciiSpace;
50 ///
51 /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
52 /// assert_eq!(words, vec![Word::from("Hello "),
53 /// Word::from("World!")]);
54 /// ```
55 AsciiSpace,
56
57 /// Split `line` into words using Unicode break properties.
58 ///
59 /// This word separator uses the Unicode line breaking algorithm
60 /// described in [Unicode Standard Annex
61 /// #14](https://www.unicode.org/reports/tr14/) to find legal places
62 /// to break lines. There is a small difference in that the U+002D
63 /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
64 /// to allow a line break at a hyphen, use
65 /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
66 /// Soft hyphens are not currently supported.
67 ///
68 /// # Examples
69 ///
70 /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
71 /// breaking algorithm will find line break opportunities between
72 /// some characters with no intervening whitespace:
73 ///
74 /// ```
75 /// #[cfg(feature = "unicode-linebreak")] {
76 /// use textwrap::core::Word;
77 /// use textwrap::WordSeparator::UnicodeBreakProperties;
78 ///
79 /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: πŸ˜‚πŸ˜").collect::<Vec<_>>(),
80 /// vec![Word::from("Emojis: "),
81 /// Word::from("πŸ˜‚"),
82 /// Word::from("😍")]);
83 ///
84 /// assert_eq!(UnicodeBreakProperties.find_words("CJK: δ½ ε₯½").collect::<Vec<_>>(),
85 /// vec![Word::from("CJK: "),
86 /// Word::from("δ½ "),
87 /// Word::from("ε₯½")]);
88 /// }
89 /// ```
90 ///
91 /// A U+2060 (Word Joiner) character can be inserted if you want to
92 /// manually override the defaults and keep the characters together:
93 ///
94 /// ```
95 /// #[cfg(feature = "unicode-linebreak")] {
96 /// use textwrap::core::Word;
97 /// use textwrap::WordSeparator::UnicodeBreakProperties;
98 ///
99 /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: πŸ˜‚\u{2060}😍").collect::<Vec<_>>(),
100 /// vec![Word::from("Emojis: "),
101 /// Word::from("πŸ˜‚\u{2060}😍")]);
102 /// }
103 /// ```
104 ///
105 /// The Unicode line breaking algorithm will also automatically
106 /// suppress break breaks around certain punctuation characters::
107 ///
108 /// ```
109 /// #[cfg(feature = "unicode-linebreak")] {
110 /// use textwrap::core::Word;
111 /// use textwrap::WordSeparator::UnicodeBreakProperties;
112 ///
113 /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
114 /// vec![Word::from("[ foo ] "),
115 /// Word::from("bar !")]);
116 /// }
117 /// ```
118 #[cfg(feature = "unicode-linebreak")]
119 UnicodeBreakProperties,
120
121 /// Find words using a custom word separator
122 Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
123 }
124
125 impl std::fmt::Debug for WordSeparator {
126 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
127 match self {
128 WordSeparator::AsciiSpace => f.write_str("AsciiSpace"),
129 #[cfg(feature = "unicode-linebreak")]
130 WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"),
131 WordSeparator::Custom(_) => f.write_str("Custom(...)"),
132 }
133 }
134 }
135
136 impl WordSeparator {
137 /// Create a new word separator.
138 ///
139 /// The best available algorithm is used by default, i.e.,
140 /// [`WordSeparator::UnicodeBreakProperties`] if available,
141 /// otherwise [`WordSeparator::AsciiSpace`].
142 pub const fn new() -> Self {
143 #[cfg(feature = "unicode-linebreak")]
144 {
145 WordSeparator::UnicodeBreakProperties
146 }
147
148 #[cfg(not(feature = "unicode-linebreak"))]
149 {
150 WordSeparator::AsciiSpace
151 }
152 }
153
154 // This function should really return impl Iterator<Item = Word>, but
155 // this isn't possible until Rust supports higher-kinded types:
156 // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
157 /// Find all words in `line`.
158 pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
159 match self {
160 WordSeparator::AsciiSpace => find_words_ascii_space(line),
161 #[cfg(feature = "unicode-linebreak")]
162 WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
163 WordSeparator::Custom(func) => func(line),
164 }
165 }
166 }
167
168 fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
169 let mut start = 0;
170 let mut in_whitespace = false;
171 let mut char_indices = line.char_indices();
172
173 Box::new(std::iter::from_fn(move || {
174 for (idx, ch) in char_indices.by_ref() {
175 if in_whitespace && ch != ' ' {
176 let word = Word::from(&line[start..idx]);
177 start = idx;
178 in_whitespace = ch == ' ';
179 return Some(word);
180 }
181
182 in_whitespace = ch == ' ';
183 }
184
185 if start < line.len() {
186 let word = Word::from(&line[start..]);
187 start = line.len();
188 return Some(word);
189 }
190
191 None
192 }))
193 }
194
// Return a copy of `text` with all ANSI escape sequences removed.
//
// `skip_ansi_escape_sequence` is handed the current character and the
// remaining characters; whenever it reports that it skipped a sequence,
// the current character is dropped as well.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    let mut stripped = String::with_capacity(text.len());

    let mut remaining = text.chars();
    while let Some(ch) = remaining.next() {
        if !skip_ansi_escape_sequence(ch, &mut remaining) {
            stripped.push(ch);
        }
    }

    stripped
}
210
211 /// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
212 /// if a line is broken at this point, and otherwise be invisible.
213 /// Textwrap does not currently support breaking words at soft
214 /// hyphens.
215 #[cfg(feature = "unicode-linebreak")]
216 const SHY: char = '\u{00ad}';
217
/// Find words in line. ANSI escape sequences are ignored in `line`.
///
/// Break opportunities are computed on a copy of `line` with the ANSI
/// escape sequences stripped and then mapped back to byte offsets in
/// the original `line`, so the yielded words are slices of the
/// original string (escape sequences included).
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // Iterator over (original index, stripped index) pairs. The
    // Unicode line breaks are found on the stripped string, but the
    // words must be cut out of the original string, so both offsets
    // are tracked side by side.
    let mut stripped_len = 0;
    let mut orig_chars = line.char_indices();
    let mut offsets = std::iter::from_fn(move || {
        let (orig_idx, ch) = orig_chars.next()?;
        let stripped_idx = stripped_len;
        let mut rest = orig_chars.by_ref().map(|(_, ch)| ch);
        // Characters belonging to an escape sequence do not occupy any
        // space in the stripped string.
        if !skip_ansi_escape_sequence(ch, &mut rest) {
            stripped_len += ch.len_utf8();
        }
        Some((orig_idx, stripped_idx))
    });

    let stripped = strip_ansi_escape_sequences(line);
    let mut breaks: Vec<_> = unicode_linebreak::linebreaks(&stripped)
        .filter(|&(idx, _)| {
            // Breaks right after ‘-’ are suppressed since they are
            // controlled via the WordSplitter. Breaks after a soft
            // hyphen are suppressed because soft hyphens are not
            // supported: all `Word` fragments must be continuous in
            // the input string. Every other break is kept.
            !matches!(
                stripped[..idx].chars().next_back(),
                Some('-') | Some(SHY)
            )
        })
        .collect();

    // Drop the final break opportunity; the tail is emitted below via
    // &line[start..], which ensures that a trailing ANSI escape
    // sequence is included in the last word.
    breaks.pop();
    let mut breaks = breaks.into_iter();

    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        // Translate the next usable break from a stripped offset to an
        // offset in the original string.
        let next_break = breaks.by_ref().find_map(|(idx, _)| {
            offsets
                .find(|&(_, stripped_idx)| stripped_idx == idx)
                .map(|(orig_idx, _)| orig_idx)
        });

        let end = match next_break {
            Some(orig_idx) => orig_idx,
            // No breaks left: emit the remaining tail once, if any.
            None if start < line.len() => line.len(),
            None => return None,
        };

        let word = Word::from(&line[start..end]);
        start = end;
        Some(word)
    }))
}
283
#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // Turn a list of string slices into the corresponding `Word`s.
    fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
        words.into_iter().map(Word::from).collect()
    }

    // Generate a pair of tests, one per word separator, from a list of
    // `[line, expected ASCII words, expected Unicode words]` cases. The
    // Unicode test is only compiled when the `unicode-linebreak`
    // feature is enabled.
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    let expected_words = to_words($ascii_words.to_vec());
                    let actual_words = WordSeparator::AsciiSpace
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }

            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    let expected_words = to_words($unicode_words.to_vec());
                    let actual_words = WordSeparator::UnicodeBreakProperties
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }
        };
    }

    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    // A single space and a run of spaces both form one word.
    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        ["    ", ["    "], ["    "]]
    );

    // A run of spaces between words stays attached to the first word.
    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["foo   bar", ["foo   ", "bar"], ["foo   ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo ", ["foo "], ["foo "]],
        ["foo   ", ["foo   "], ["foo   "]]
    );

    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        [" foo", [" ", "foo"], [" ", "foo"]],
        ["   foo", ["   ", "foo"], ["   ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
    );

    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    #[test]
    fn find_words_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(text),
            vec![Word::from(text)]
        );
    }

    #[test]
    fn word_separator_new() {
        #[cfg(feature = "unicode-linebreak")]
        assert!(matches!(WordSeparator::new(), UnicodeBreakProperties));

        #[cfg(not(feature = "unicode-linebreak"))]
        assert!(matches!(WordSeparator::new(), AsciiSpace));
    }
}