]> git.proxmox.com Git - rustc.git/blob - src/vendor/unicode-segmentation/src/grapheme.rs
New upstream version 1.19.0+dfsg1
[rustc.git] / src / vendor / unicode-segmentation / src / grapheme.rs
1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use core::cmp;
12
13 use tables::grapheme::GraphemeCat;
14
15 /// External iterator for grapheme clusters and byte offsets.
16 #[derive(Clone)]
17 pub struct GraphemeIndices<'a> {
18 start_offset: usize,
19 iter: Graphemes<'a>,
20 }
21
22 impl<'a> Iterator for GraphemeIndices<'a> {
23 type Item = (usize, &'a str);
24
25 #[inline]
26 fn next(&mut self) -> Option<(usize, &'a str)> {
27 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
28 }
29
30 #[inline]
31 fn size_hint(&self) -> (usize, Option<usize>) {
32 self.iter.size_hint()
33 }
34 }
35
36 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
37 #[inline]
38 fn next_back(&mut self) -> Option<(usize, &'a str)> {
39 self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
40 }
41 }
42
43 /// External iterator for a string's
44 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
45 #[derive(Clone)]
46 pub struct Graphemes<'a> {
47 string: &'a str,
48 extended: bool,
49 cat: Option<GraphemeCat>,
50 catb: Option<GraphemeCat>,
51 }
52
// States of the scanner that applies the cluster boundary rules.
// The forward and backward scanners share these states; for the Hangul
// states the backward scanner interprets them as "the letter to the right".
#[derive(PartialEq, Eq)]
enum GraphemeState {
    // No character of the current cluster has been classified yet.
    Start,
    // Looking for extending characters.
    FindExtend,
    // Hangul syllable handling: an L is adjacent.
    HangulL,
    // Hangul syllable handling: a V (or, scanning forward, an LV) is adjacent.
    HangulLV,
    // Hangul syllable handling: a T (or, scanning forward, an LVT) is adjacent.
    HangulLVT,
    // Inside a run of regional indicator characters.
    Regional,
}
63
64 impl<'a> Iterator for Graphemes<'a> {
65 type Item = &'a str;
66
67 #[inline]
68 fn size_hint(&self) -> (usize, Option<usize>) {
69 let slen = self.string.len();
70 (cmp::min(slen, 1), Some(slen))
71 }
72
73 #[inline]
74 fn next(&mut self) -> Option<&'a str> {
75 use self::GraphemeState::*;
76 use tables::grapheme as gr;
77 if self.string.len() == 0 {
78 return None;
79 }
80
81 let mut take_curr = true;
82 let mut idx = 0;
83 let mut state = Start;
84 let mut cat = gr::GC_Any;
85 for (curr, ch) in self.string.char_indices() {
86 idx = curr;
87
88 // retrieve cached category, if any
89 // We do this because most of the time we would end up
90 // looking up each character twice.
91 cat = match self.cat {
92 None => gr::grapheme_category(ch),
93 _ => self.cat.take().unwrap()
94 };
95
96 if match cat {
97 gr::GC_Extend => true,
98 gr::GC_SpacingMark if self.extended => true,
99 _ => false
100 } {
101 state = FindExtend; // rule GB9/GB9a
102 continue;
103 }
104
105 state = match state {
106 Start if '\r' == ch => {
107 let slen = self.string.len();
108 let nidx = idx + 1;
109 if nidx != slen && self.string[nidx..].chars().next().unwrap() == '\n' {
110 idx = nidx; // rule GB3
111 }
112 break; // rule GB4
113 }
114 Start => match cat {
115 gr::GC_Control => break,
116 gr::GC_L => HangulL,
117 gr::GC_LV | gr::GC_V => HangulLV,
118 gr::GC_LVT | gr::GC_T => HangulLVT,
119 gr::GC_Regional_Indicator => Regional,
120 _ => FindExtend
121 },
122 FindExtend => { // found non-extending when looking for extending
123 take_curr = false;
124 break;
125 },
126 HangulL => match cat { // rule GB6: L x (L|V|LV|LVT)
127 gr::GC_L => continue,
128 gr::GC_LV | gr::GC_V => HangulLV,
129 gr::GC_LVT => HangulLVT,
130 _ => {
131 take_curr = false;
132 break;
133 }
134 },
135 HangulLV => match cat { // rule GB7: (LV|V) x (V|T)
136 gr::GC_V => continue,
137 gr::GC_T => HangulLVT,
138 _ => {
139 take_curr = false;
140 break;
141 }
142 },
143 HangulLVT => match cat { // rule GB8: (LVT|T) x T
144 gr::GC_T => continue,
145 _ => {
146 take_curr = false;
147 break;
148 }
149 },
150 Regional => match cat { // rule GB8a
151 gr::GC_Regional_Indicator => continue,
152 _ => {
153 take_curr = false;
154 break;
155 }
156 }
157 }
158 }
159
160 self.cat = if take_curr {
161 idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
162 None
163 } else {
164 Some(cat)
165 };
166
167 let retstr = &self.string[..idx];
168 self.string = &self.string[idx..];
169 Some(retstr)
170 }
171 }
172
173 impl<'a> DoubleEndedIterator for Graphemes<'a> {
174 #[inline]
175 fn next_back(&mut self) -> Option<&'a str> {
176 use self::GraphemeState::*;
177 use tables::grapheme as gr;
178 if self.string.len() == 0 {
179 return None;
180 }
181
182 let mut take_curr = true;
183 let mut idx = self.string.len();
184 let mut previdx = idx;
185 let mut state = Start;
186 let mut cat = gr::GC_Any;
187 for (curr, ch) in self.string.char_indices().rev() {
188 previdx = idx;
189 idx = curr;
190
191 // cached category, if any
192 cat = match self.catb {
193 None => gr::grapheme_category(ch),
194 _ => self.catb.take().unwrap()
195 };
196
197 // a matching state machine that runs *backwards* across an input string
198 // note that this has some implications for the Hangul matching, since
199 // we now need to know what the rightward letter is:
200 //
201 // Right to left, we have:
202 // L x L
203 // V x (L|V|LV)
204 // T x (V|T|LV|LVT)
205 // HangulL means the letter to the right is L
206 // HangulLV means the letter to the right is V
207 // HangulLVT means the letter to the right is T
208 state = match state {
209 Start if '\n' == ch => {
210 if idx > 0 && '\r' == self.string[..idx].chars().next_back().unwrap() {
211 idx -= 1; // rule GB3
212 }
213 break; // rule GB4
214 },
215 Start | FindExtend => match cat {
216 gr::GC_Extend => FindExtend,
217 gr::GC_SpacingMark if self.extended => FindExtend,
218 gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
219 gr::GC_V => HangulLV,
220 gr::GC_T => HangulLVT,
221 gr::GC_Regional_Indicator => Regional,
222 gr::GC_Control => {
223 take_curr = Start == state;
224 break;
225 },
226 _ => break
227 },
228 HangulL => match cat { // char to right is an L
229 gr::GC_L => continue, // L x L is the only legal match
230 _ => {
231 take_curr = false;
232 break;
233 }
234 },
235 HangulLV => match cat { // char to right is a V
236 gr::GC_V => continue, // V x V, right char is still V
237 gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L
238 _ => {
239 take_curr = false;
240 break;
241 }
242 },
243 HangulLVT => match cat { // char to right is a T
244 gr::GC_T => continue, // T x T, right char is still T
245 gr::GC_V => HangulLV, // V x T, right char is now V
246 gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L
247 _ => {
248 take_curr = false;
249 break;
250 }
251 },
252 Regional => match cat { // rule GB8a
253 gr::GC_Regional_Indicator => continue,
254 _ => {
255 take_curr = false;
256 break;
257 }
258 }
259 }
260 }
261
262 self.catb = if take_curr {
263 None
264 } else {
265 idx = previdx;
266 Some(cat)
267 };
268
269 let retstr = &self.string[idx..];
270 self.string = &self.string[..idx];
271 Some(retstr)
272 }
273 }
274
275 #[inline]
276 pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
277 Graphemes { string: s, extended: is_extended, cat: None, catb: None }
278 }
279
280 #[inline]
281 pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
282 GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
283 }