]>
Commit | Line | Data |
---|---|---|
f035d41b | 1 | use crate::lookups::{ |
dfeec247 XL |
2 | canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed, |
3 | stream_safe_trailing_nonstarters, | |
4 | }; | |
f035d41b XL |
5 | use crate::normalize::{hangul_decomposition_length, is_hangul_syllable}; |
6 | use crate::tables::stream_safe_leading_nonstarters; | |
8faf50e0 XL |
7 | |
/// Largest run of contiguous non-starters (counted in NFKD) that may be
/// emitted before a Combining Grapheme Joiner has to be inserted.
pub(crate) const MAX_NONSTARTERS: usize = 30;

/// U+034F COMBINING GRAPHEME JOINER, the character inserted to break up
/// over-long runs of non-starters.
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
10 | ||
/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
#[derive(Clone, Debug)]
pub struct StreamSafe<I> {
    /// Underlying source of characters.
    iter: I,
    /// Number of contiguous non-starters (in NFKD) emitted since the last
    /// starter or inserted joiner.
    nonstarter_count: usize,
    /// A character that was pulled from `iter` but held back because a joiner
    /// had to be emitted first; it is yielded on the following call.
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    /// Wraps `iter` with a zeroed non-starter counter and nothing buffered.
    pub(crate) fn new(iter: I) -> Self {
        Self {
            iter,
            nonstarter_count: 0,
            buffer: None,
        }
    }
}
29 | ||
f035d41b | 30 | impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> { |
8faf50e0 XL |
31 | type Item = char; |
32 | ||
33 | #[inline] | |
34 | fn next(&mut self) -> Option<char> { | |
6a06907d | 35 | let next_ch = match self.buffer.take().or_else(|| self.iter.next()) { |
8faf50e0 XL |
36 | None => return None, |
37 | Some(c) => c, | |
38 | }; | |
39 | let d = classify_nonstarters(next_ch); | |
40 | if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS { | |
6a06907d XL |
41 | // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing |
42 | // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the | |
43 | // iterator (via `self.buffer`), and we'll reclassify it next iteration. | |
44 | self.nonstarter_count = 0; | |
8faf50e0 | 45 | self.buffer = Some(next_ch); |
8faf50e0 XL |
46 | return Some(COMBINING_GRAPHEME_JOINER); |
47 | } | |
48 | ||
6a06907d XL |
49 | // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous |
50 | // nonstarters in NKFD. | |
8faf50e0 XL |
51 | if d.leading_nonstarters == d.decomposition_len { |
52 | self.nonstarter_count += d.decomposition_len; | |
53 | } | |
6a06907d | 54 | // Otherwise, reset the counter to the decomposition's number of trailing nonstarters. |
8faf50e0 XL |
55 | else { |
56 | self.nonstarter_count = d.trailing_nonstarters; | |
57 | } | |
58 | Some(next_ch) | |
59 | } | |
60 | } | |
61 | ||
/// Summary of a character's non-starter layout, as produced by
/// `classify_nonstarters`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct Decomposition {
    /// Number of non-starters at the beginning of the decomposition.
    pub(crate) leading_nonstarters: usize,
    /// Number of non-starters at the end of the decomposition.
    pub(crate) trailing_nonstarters: usize,
    /// Total number of characters in the decomposition.
    pub(crate) decomposition_len: usize,
}
68 | ||
69 | #[inline] | |
70 | pub(crate) fn classify_nonstarters(c: char) -> Decomposition { | |
71 | // As usual, fast path for ASCII (which is always a starter) | |
72 | if c <= '\x7f' { | |
73 | return Decomposition { | |
74 | leading_nonstarters: 0, | |
75 | trailing_nonstarters: 0, | |
76 | decomposition_len: 1, | |
f035d41b | 77 | }; |
8faf50e0 XL |
78 | } |
79 | // Next, special case Hangul, since it's not handled by our tables. | |
80 | if is_hangul_syllable(c) { | |
81 | return Decomposition { | |
82 | leading_nonstarters: 0, | |
83 | trailing_nonstarters: 0, | |
84 | decomposition_len: hangul_decomposition_length(c), | |
85 | }; | |
86 | } | |
f035d41b | 87 | let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c)); |
8faf50e0 | 88 | match decomp { |
f035d41b XL |
89 | Some(decomp) => Decomposition { |
90 | leading_nonstarters: stream_safe_leading_nonstarters(c), | |
91 | trailing_nonstarters: stream_safe_trailing_nonstarters(c), | |
92 | decomposition_len: decomp.len(), | |
8faf50e0 XL |
93 | }, |
94 | None => { | |
dfeec247 | 95 | let is_nonstarter = canonical_combining_class(c) != 0; |
8faf50e0 XL |
96 | let nonstarter = if is_nonstarter { 1 } else { 0 }; |
97 | Decomposition { | |
98 | leading_nonstarters: nonstarter, | |
99 | trailing_nonstarters: nonstarter, | |
100 | decomposition_len: 1, | |
101 | } | |
102 | } | |
103 | } | |
104 | } | |
105 | ||
106 | #[cfg(test)] | |
107 | mod tests { | |
f035d41b XL |
108 | use super::{classify_nonstarters, StreamSafe}; |
109 | use crate::lookups::canonical_combining_class; | |
110 | use crate::normalize::decompose_compatible; | |
111 | ||
112 | #[cfg(not(feature = "std"))] | |
113 | use crate::no_std_prelude::*; | |
114 | ||
115 | use core::char; | |
8faf50e0 XL |
116 | |
117 | fn stream_safe(s: &str) -> String { | |
118 | StreamSafe::new(s.chars()).collect() | |
119 | } | |
120 | ||
8faf50e0 XL |
121 | #[test] |
122 | fn test_simple() { | |
123 | let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone"; | |
124 | assert_eq!(stream_safe(technically_okay), technically_okay); | |
125 | ||
126 | let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone"; | |
5869c6ff XL |
127 | let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone"; |
128 | assert_eq!(stream_safe(too_much), fixed_it); | |
129 | ||
130 | let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone"; | |
131 | let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone"; | |
132 | assert_eq!(stream_safe(woah_nelly), its_cool); | |
8faf50e0 XL |
133 | } |
134 | ||
6a06907d XL |
135 | #[test] |
136 | fn test_all_nonstarters() { | |
137 | let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}"; | |
138 | let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}"; | |
139 | assert_eq!(stream_safe(s), expected); | |
140 | } | |
141 | ||
8faf50e0 XL |
142 | #[test] |
143 | fn test_classify_nonstarters() { | |
144 | // Highest character in the `compat_fully_decomp` table is 2FA1D | |
145 | for ch in 0..0x2FA1E { | |
146 | let ch = match char::from_u32(ch) { | |
147 | Some(c) => c, | |
148 | None => continue, | |
149 | }; | |
150 | let c = classify_nonstarters(ch); | |
f035d41b | 151 | let mut s = Vec::new(); |
8faf50e0 XL |
152 | decompose_compatible(ch, |c| s.push(c)); |
153 | ||
154 | assert_eq!(s.len(), c.decomposition_len); | |
155 | ||
156 | let num_leading = s | |
157 | .iter() | |
dfeec247 | 158 | .take_while(|&c| canonical_combining_class(*c) != 0) |
8faf50e0 XL |
159 | .count(); |
160 | let num_trailing = s | |
161 | .iter() | |
162 | .rev() | |
dfeec247 | 163 | .take_while(|&c| canonical_combining_class(*c) != 0) |
8faf50e0 XL |
164 | .count(); |
165 | ||
166 | assert_eq!(num_leading, c.leading_nonstarters); | |
167 | assert_eq!(num_trailing, c.trailing_nonstarters); | |
168 | } | |
169 | } | |
170 | } |