]> git.proxmox.com Git - rustc.git/blame - src/vendor/unicode-segmentation/src/word.rs
New upstream version 1.19.0+dfsg1
[rustc.git] / src / vendor / unicode-segmentation / src / word.rs
CommitLineData
8bb4bdeb
XL
1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14use tables::word::WordCat;
15
16/// An iterator over the substrings of a string which, after splitting the string on
17/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18/// contain any characters with the
19/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20/// property, or with
21/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22pub struct UnicodeWords<'a> {
23 inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
24}
25
26impl<'a> Iterator for UnicodeWords<'a> {
27 type Item = &'a str;
28
29 #[inline]
30 fn next(&mut self) -> Option<&'a str> { self.inner.next() }
31}
32impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
33 #[inline]
34 fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
35}
36
37/// External iterator for a string's
38/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
39#[derive(Clone)]
40pub struct UWordBounds<'a> {
41 string: &'a str,
42 cat: Option<WordCat>,
43 catb: Option<WordCat>,
44}
45
46/// External iterator for word boundaries and byte offsets.
47#[derive(Clone)]
48pub struct UWordBoundIndices<'a> {
49 start_offset: usize,
50 iter: UWordBounds<'a>,
51}
52
8bb4bdeb
XL
53impl<'a> Iterator for UWordBoundIndices<'a> {
54 type Item = (usize, &'a str);
55
56 #[inline]
57 fn next(&mut self) -> Option<(usize, &'a str)> {
58 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
59 }
60
61 #[inline]
62 fn size_hint(&self) -> (usize, Option<usize>) {
63 self.iter.size_hint()
64 }
65}
66
67impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
68 #[inline]
69 fn next_back(&mut self) -> Option<(usize, &'a str)> {
70 self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
71 }
72}
73
74// state machine for word boundary rules
7cac9316 75#[derive(Clone,Copy,PartialEq,Eq)]
8bb4bdeb
XL
76enum UWordBoundsState {
77 Start,
78 Letter,
79 HLetter,
80 Numeric,
81 Katakana,
82 ExtendNumLet,
7cac9316 83 Regional,
8bb4bdeb 84 FormatExtend(FormatExtendType),
8bb4bdeb
XL
85}
86
// subtypes for FormatExtend state in UWordBoundsState
#[derive(Clone,Copy,PartialEq,Eq)]
enum FormatExtendType {
    // backward iteration only: Format/Extend seen first; the next category is
    // handled exactly as it would be in the Start state
    AcceptAny,
    // scanning a run of Format/Extend characters; nothing else joins the run
    AcceptNone,
    // a mid-letter style character was seen; an ALetter or Hebrew_Letter must follow
    RequireLetter,
    // a double quote next to a Hebrew_Letter was seen; a Hebrew_Letter must follow
    RequireHLetter,
    // a single quote was seen; a letter may follow, but the quote is kept either way
    // when iterating forward
    AcceptQLetter,
    // a mid-number style character was seen; a Numeric must follow
    RequireNumeric,
}
97
8bb4bdeb
XL
98impl<'a> Iterator for UWordBounds<'a> {
99 type Item = &'a str;
100
101 #[inline]
102 fn size_hint(&self) -> (usize, Option<usize>) {
103 let slen = self.string.len();
104 (cmp::min(slen, 1), Some(slen))
105 }
106
107 #[inline]
108 fn next(&mut self) -> Option<&'a str> {
109 use self::UWordBoundsState::*;
110 use self::FormatExtendType::*;
111 use tables::word as wd;
112 if self.string.len() == 0 {
113 return None;
114 }
115
116 let mut take_curr = true;
117 let mut take_cat = true;
118 let mut idx = 0;
119 let mut saveidx = 0;
120 let mut state = Start;
121 let mut cat = wd::WC_Any;
122 let mut savecat = wd::WC_Any;
8bb4bdeb
XL
123 for (curr, ch) in self.string.char_indices() {
124 idx = curr;
7cac9316 125
8bb4bdeb
XL
126 // if there's a category cached, grab it
127 cat = match self.cat {
128 None => wd::word_category(ch),
129 _ => self.cat.take().unwrap()
130 };
131 take_cat = true;
132
133 // handle rule WB4
7cac9316 134 // just skip all format and extend chars
8bb4bdeb
XL
135 // note that Start is a special case: if there's a bunch of Format | Extend
136 // characters at the beginning of a block of text, dump them out as one unit.
137 //
138 // (This is not obvious from the wording of UAX#29, but if you look at the
139 // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
140 // then the "correct" interpretation of WB4 becomes apparent.)
7cac9316
XL
141 if state != Start && (cat == wd::WC_Extend || cat == wd::WC_Format) {
142 continue;
8bb4bdeb
XL
143 }
144
8bb4bdeb
XL
145 state = match state {
146 Start if cat == wd::WC_CR => {
147 idx += match self.get_next_cat(idx) {
148 Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
149 _ => 0
150 };
151 break; // rule WB3a
152 },
153 Start => match cat {
154 wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
155 wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
156 wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
157 wd::WC_Katakana => Katakana, // rule WB13, WB13a
158 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
7cac9316 159 wd::WC_Regional_Indicator => Regional, // rule WB13c
8bb4bdeb 160 wd::WC_LF | wd::WC_Newline => break, // rule WB3a
8bb4bdeb
XL
161 _ => {
162 if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
7cac9316 163 if ncat == wd::WC_Format || ncat == wd::WC_Extend {
8bb4bdeb
XL
164 state = FormatExtend(AcceptNone);
165 self.cat = Some(ncat);
166 continue;
167 }
168 }
7cac9316 169 break; // rule WB14
8bb4bdeb
XL
170 }
171 },
8bb4bdeb
XL
172 Letter | HLetter => match cat {
173 wd::WC_ALetter => Letter, // rule WB5
174 wd::WC_Hebrew_Letter => HLetter, // rule WB5
175 wd::WC_Numeric => Numeric, // rule WB9
176 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
177 wd::WC_Double_Quote if state == HLetter => {
178 savecat = cat;
179 saveidx = idx;
180 FormatExtend(RequireHLetter) // rule WB7b
181 },
182 wd::WC_Single_Quote if state == HLetter => {
183 FormatExtend(AcceptQLetter) // rule WB7a
184 },
185 wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
186 savecat = cat;
187 saveidx = idx;
188 FormatExtend(RequireLetter) // rule WB6
189 },
190 _ => {
191 take_curr = false;
192 break;
193 }
194 },
195 Numeric => match cat {
196 wd::WC_Numeric => Numeric, // rule WB8
197 wd::WC_ALetter => Letter, // rule WB10
198 wd::WC_Hebrew_Letter => HLetter, // rule WB10
199 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
200 wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
201 savecat = cat;
202 saveidx = idx;
203 FormatExtend(RequireNumeric) // rule WB12
204 },
205 _ => {
206 take_curr = false;
207 break;
208 }
209 },
210 Katakana => match cat {
211 wd::WC_Katakana => Katakana, // rule WB13
212 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
213 _ => {
214 take_curr = false;
215 break;
216 }
217 },
218 ExtendNumLet => match cat {
219 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
220 wd::WC_ALetter => Letter, // rule WB13b
221 wd::WC_Hebrew_Letter => HLetter, // rule WB13b
222 wd::WC_Numeric => Numeric, // rule WB13b
223 wd::WC_Katakana => Katakana, // rule WB13b
224 _ => {
225 take_curr = false;
226 break;
227 }
228 },
7cac9316
XL
229 Regional => match cat {
230 wd::WC_Regional_Indicator => Regional, // rule WB13c
8bb4bdeb
XL
231 _ => {
232 take_curr = false;
233 break;
234 }
235 },
236 FormatExtend(t) => match t { // handle FormatExtends depending on what type
237 RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
238 RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
239 RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
240 RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
241 AcceptNone | AcceptQLetter => {
242 take_curr = false; // emit all the Format|Extend characters
243 take_cat = false;
244 break;
245 },
246 _ => break // rewind (in if statement below)
247 }
248 }
249 }
250
251 if let FormatExtend(t) = state {
252 // we were looking for something and didn't find it; we have to back up
253 if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
254 idx = saveidx;
255 cat = savecat;
256 take_curr = false;
257 }
258 }
259
260 self.cat = if take_curr {
261 idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
262 None
263 } else if take_cat {
264 Some(cat)
265 } else {
266 None
267 };
268
269 let retstr = &self.string[..idx];
270 self.string = &self.string[idx..];
271 Some(retstr)
272 }
273}
274
275impl<'a> DoubleEndedIterator for UWordBounds<'a> {
276 #[inline]
277 fn next_back(&mut self) -> Option<&'a str> {
278 use self::UWordBoundsState::*;
279 use self::FormatExtendType::*;
280 use tables::word as wd;
281 if self.string.len() == 0 {
282 return None;
283 }
284
285 let mut take_curr = true;
286 let mut take_cat = true;
287 let mut idx = self.string.len();
288 idx -= self.string.chars().next_back().unwrap().len_utf8();
289 let mut previdx = idx;
290 let mut saveidx = idx;
291 let mut state = Start;
292 let mut savestate = Start;
293 let mut cat = wd::WC_Any;
8bb4bdeb
XL
294 for (curr, ch) in self.string.char_indices().rev() {
295 previdx = idx;
296 idx = curr;
297
298 // if there's a category cached, grab it
299 cat = match self.catb {
300 None => wd::word_category(ch),
301 _ => self.catb.take().unwrap()
302 };
303 take_cat = true;
304
305 // backward iterator over word boundaries. Mostly the same as the forward
306 // iterator, with two weirdnesses:
307 // (1) If we encounter a single quote in the Start state, we have to check for a
308 // Hebrew Letter immediately before it.
309 // (2) Format and Extend char handling takes some gymnastics.
310
7cac9316 311 if cat == wd::WC_Extend || cat == wd::WC_Format {
8bb4bdeb
XL
312 if match state {
313 FormatExtend(_) | Start => false,
314 _ => true
315 } {
316 saveidx = previdx;
317 savestate = state;
318 state = FormatExtend(AcceptNone);
319 }
320
321 if state != Start {
322 continue;
323 }
324 } else if state == FormatExtend(AcceptNone) {
325 // finished a scan of some Format|Extend chars, restore previous state
326 state = savestate;
327 previdx = saveidx;
328 take_cat = false;
329 }
330
8bb4bdeb
XL
331 state = match state {
332 Start | FormatExtend(AcceptAny) => match cat {
333 wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
334 wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
335 wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
336 wd::WC_Katakana => Katakana, // rule WB13, WB13b
337 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
7cac9316
XL
338 wd::WC_Regional_Indicator => Regional, // rule WB13c
339 wd::WC_Extend | wd::WC_Format => FormatExtend(AcceptAny), // rule WB4
8bb4bdeb
XL
340 wd::WC_Single_Quote => {
341 saveidx = idx;
342 FormatExtend(AcceptQLetter) // rule WB7a
343 },
8bb4bdeb
XL
344 wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
345 if state == Start {
346 if cat == wd::WC_LF {
347 idx -= match self.get_prev_cat(idx) {
348 Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
349 _ => 0
350 };
351 }
352 } else {
353 take_curr = false;
354 }
355 break; // rule WB3a
356 },
7cac9316 357 _ => break // rule WB14
8bb4bdeb
XL
358 },
359 Letter | HLetter => match cat {
360 wd::WC_ALetter => Letter, // rule WB5
361 wd::WC_Hebrew_Letter => HLetter, // rule WB5
362 wd::WC_Numeric => Numeric, // rule WB10
363 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
364 wd::WC_Double_Quote if state == HLetter => {
365 saveidx = previdx;
366 FormatExtend(RequireHLetter) // rule WB7c
367 },
368 wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
369 saveidx = previdx;
370 FormatExtend(RequireLetter) // rule WB7
371 },
372 _ => {
373 take_curr = false;
374 break;
375 }
376 },
377 Numeric => match cat {
378 wd::WC_Numeric => Numeric, // rule WB8
379 wd::WC_ALetter => Letter, // rule WB9
380 wd::WC_Hebrew_Letter => HLetter, // rule WB9
381 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
382 wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
383 saveidx = previdx;
384 FormatExtend(RequireNumeric) // rule WB11
385 },
386 _ => {
387 take_curr = false;
388 break;
389 }
390 },
391 Katakana => match cat {
392 wd::WC_Katakana => Katakana, // rule WB13
393 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
394 _ => {
395 take_curr = false;
396 break;
397 }
398 },
399 ExtendNumLet => match cat {
400 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
401 wd::WC_ALetter => Letter, // rule WB13a
402 wd::WC_Hebrew_Letter => HLetter, // rule WB13a
403 wd::WC_Numeric => Numeric, // rule WB13a
404 wd::WC_Katakana => Katakana, // rule WB13a
405 _ => {
406 take_curr = false;
407 break;
408 }
409 },
7cac9316
XL
410 Regional => match cat {
411 wd::WC_Regional_Indicator => Regional, // rule WB13c
8bb4bdeb
XL
412 _ => {
413 take_curr = false;
414 break;
415 }
416 },
417 FormatExtend(t) => match t {
418 RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
419 RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
420 RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
421 AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
422 RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
423 _ => break // backtrack will happens
424 }
425 }
426 }
427
428 if let FormatExtend(t) = state {
429 // if we required something but didn't find it, backtrack
430 if t == RequireLetter || t == RequireHLetter ||
431 t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
432 previdx = saveidx;
433 take_cat = false;
434 take_curr = false;
435 }
436 }
437
438 self.catb = if take_curr {
439 None
440 } else {
441 idx = previdx;
442 if take_cat {
443 Some(cat)
444 } else {
445 None
446 }
447 };
448
449 let retstr = &self.string[idx..];
450 self.string = &self.string[..idx];
451 Some(retstr)
452 }
453}
454
455impl<'a> UWordBounds<'a> {
8bb4bdeb
XL
456 #[inline]
457 fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
458 use tables::word as wd;
459 let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
460 if nidx < self.string.len() {
461 let nch = self.string[nidx..].chars().next().unwrap();
462 Some(wd::word_category(nch))
463 } else {
464 None
465 }
466 }
467
468 #[inline]
469 fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
470 use tables::word as wd;
471 if idx > 0 {
472 let nch = self.string[..idx].chars().next_back().unwrap();
473 Some(wd::word_category(nch))
474 } else {
475 None
476 }
477 }
478}
479
480#[inline]
481pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
482 UWordBounds { string: s, cat: None, catb: None }
483}
484
485#[inline]
486pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
487 UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) }
488}
489
490#[inline]
491pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
492 use super::UnicodeSegmentation;
493 use tables::util::is_alphanumeric;
494
495 fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
496 let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
497
498 UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
499}