[rustc.git] / vendor / unicode-segmentation / src / sentence.rs

// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use core::cmp;
use core::iter::Filter;

// All of the logic for forward iteration over sentences
mod fwd {
    use crate::tables::sentence::SentenceCat;
    use core::cmp;

    // Describe a parsed part of source string as described in this table:
    // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
    //
    // One `StatePart` fills one slot of `SentenceBreaksState`. Most variants
    // are produced from a `SentenceCat` by `SentenceBreaksState::next`.
    #[derive(Clone, Copy, PartialEq, Eq)]
    enum StatePart {
        // Filler used for every slot of `INITIAL_STATE` (nothing seen yet)
        Sot,
        // Pushed by `SentenceBreaksState::end` once the input is exhausted
        Eot,
        // Any category that has no dedicated variant below
        Other,
        // `SC_CR`
        CR,
        // `SC_LF`
        LF,
        // `SC_Sep`
        Sep,
        // `SC_ATerm`
        ATerm,
        // Either `SC_Upper` or `SC_Lower`
        UpperLower,
        // One or more consecutive `SC_Close` (a run occupies a single slot)
        ClosePlus,
        // One or more consecutive `SC_Sp` (a run occupies a single slot)
        SpPlus,
        // `SC_STerm`
        STerm
    }

    // Window over the four most recently recorded parts. Index 3 holds the
    // newest part and index 0 the oldest; `next` and `end` shift the window
    // towards index 0 when they append a part.
    #[derive(Clone, PartialEq, Eq)]
    struct SentenceBreaksState(pub [StatePart; 4]);

    // State of a freshly created iterator: every slot is `Sot`, which is what
    // the SB1 arm of `SentenceBreaks::next` tests for.
    const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
        StatePart::Sot,
        StatePart::Sot,
        StatePart::Sot,
        StatePart::Sot
    ]);

    // Iterator that walks `string` and yields the byte offsets of sentence
    // breaks (see its `Iterator` implementation).
    #[derive(Clone)]
    pub struct SentenceBreaks<'a> {
        // The text being segmented
        pub string: &'a str,
        // Byte offset of the next character to examine
        pos: usize,
        // Window of recently seen parts that the break rules are matched against
        state: SentenceBreaksState
    }

    impl SentenceBreaksState {
        // Attempt to advance the internal state by one part
        // Whitespace and some punctutation will be collapsed
        fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
            let &SentenceBreaksState(parts) = self;
            let parts = match (parts[3], cat) {
                (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
                (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
                _ => [
                    parts[1],
                    parts[2],
                    parts[3],
                    match cat {
                        SentenceCat::SC_CR => StatePart::CR,
                        SentenceCat::SC_LF => StatePart::LF,
                        SentenceCat::SC_Sep => StatePart::Sep,
                        SentenceCat::SC_ATerm => StatePart::ATerm,
                        SentenceCat::SC_Upper |
                        SentenceCat::SC_Lower => StatePart::UpperLower,
                        SentenceCat::SC_Close => StatePart::ClosePlus,
                        SentenceCat::SC_Sp => StatePart::SpPlus,
                        SentenceCat::SC_STerm => StatePart::STerm,
                        _ => StatePart::Other
                    }
                ]
            };
            SentenceBreaksState(parts)
        }

        fn end(&self) -> SentenceBreaksState {
            let &SentenceBreaksState(parts) = self;
            SentenceBreaksState([
                parts[1],
                parts[2],
                parts[3],
                StatePart::Eot
            ])
        }

        // Helper function to check if state head matches a single `StatePart`
        fn match1(&self, part: StatePart) -> bool {
            let &SentenceBreaksState(parts) = self;
            part == parts[3]
        }

        // Helper function to check if first two `StateParts` in state match
        // the given two
        fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
            let &SentenceBreaksState(parts) = self;
            part1 == parts[2] && part2 == parts[3]
        }
    }

    // https://unicode.org/reports/tr29/#SB8
    // TODO cache this, it is currently quadratic
    //
    // Left context: ATerm Close* Sp*, read off the state window.
    // Right context: `ahead` is scanned until a character decides the
    // outcome; `Lower` means the rule applies.
    fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
        use crate::tables::sentence as se;

        // Step back over an optional Sp+ and then an optional Close+ to find
        // the slot that has to hold the ATerm.
        let window = state.0;
        let mut at = 3;
        if window[at] == StatePart::SpPlus {
            at -= 1;
        }
        if window[at] == StatePart::ClosePlus {
            at -= 1;
        }
        if window[at] != StatePart::ATerm {
            return false;
        }

        // ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
        for cat in ahead.chars().map(|c| se::sentence_category(c).2) {
            match cat {
                se::SC_Lower => return true,
                se::SC_OLetter | se::SC_Upper => return false,
                se::SC_Sep | se::SC_CR | se::SC_LF => return false,
                se::SC_STerm | se::SC_ATerm => return false,
                _ => {}
            }
        }

        // Ran out of input without reaching a Lower
        false
    }

    // https://unicode.org/reports/tr29/#SB8a
    //
    // Left context only: SATerm Close* Sp*
    fn match_sb8a(state: &SentenceBreaksState) -> bool {
        let window = state.0;

        // Step back over an optional Sp+ and then an optional Close+
        let mut at = 3;
        if window[at] == StatePart::SpPlus {
            at -= 1;
        }
        if window[at] == StatePart::ClosePlus {
            at -= 1;
        }

        match window[at] {
            StatePart::STerm | StatePart::ATerm => true,
            _ => false,
        }
    }

    // https://unicode.org/reports/tr29/#SB9
    //
    // Left context only: SATerm Close*
    fn match_sb9(state: &SentenceBreaksState) -> bool {
        let window = state.0;

        // If the newest slot is a Close+ run, the terminator has to sit in
        // the slot just before it; otherwise the newest slot is the candidate.
        let candidate = match window[3] {
            StatePart::ClosePlus => window[2],
            newest => newest,
        };

        candidate == StatePart::ATerm || candidate == StatePart::STerm
    }

    // https://unicode.org/reports/tr29/#SB11
    //
    // Left context only: SATerm Close* Sp* ParaSep?
    fn match_sb11(state: &SentenceBreaksState) -> bool {
        let window = state.0;
        let mut at = 3;

        // Optional ParaSep (Sep, CR or LF) in the newest slot
        match window[at] {
            StatePart::Sep | StatePart::CR | StatePart::LF => at -= 1,
            _ => {}
        }
        // Optional Sp+ run, then optional Close+ run
        if window[at] == StatePart::SpPlus {
            at -= 1;
        }
        if window[at] == StatePart::ClosePlus {
            at -= 1;
        }

        match window[at] {
            StatePart::STerm | StatePart::ATerm => true,
            _ => false,
        }
    }

    // Iterator over sentence-break positions. Each item is a byte offset into
    // `self.string`.
    impl<'a> Iterator for SentenceBreaks<'a> {
        // Returns the index of the character which follows a break
        type Item = usize;

        // Both bounds are computed from the total byte length of the string
        // alone; `self.pos` is not consulted, so the hint is the same however
        // far iteration has progressed.
        // NOTE(review): once breaks have been yielded, the lower bound can be
        // larger than the number of breaks still to come -- confirm whether
        // any caller relies on it mid-iteration.
        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            let slen = self.string.len();
            // A sentence could be one character
            (cmp::min(slen, 2), Some(slen + 1))
        }

        // Scan forward from `self.pos` one character at a time and return the
        // offset of the next break. The rules are tried in the order of the
        // match arms below, so an earlier arm takes precedence over a later
        // one. Arms that `continue` mean "no break before this character".
        #[inline]
        fn next(&mut self) -> Option<usize> {
            use crate::tables::sentence as se;

            for next_char in self.string[self.pos..].chars() {
                // Snapshot taken before consuming `next_char`: the rules look
                // at what preceded the character, and a returned break is the
                // offset at which the character starts.
                let position_before = self.pos;
                let state_before = self.state.clone();

                let next_cat = se::sentence_category(next_char).2;

                // Consume the character up front; `self.pos` and `self.state`
                // already describe the text *after* it when the rules run.
                self.pos += next_char.len_utf8();
                self.state = self.state.next(next_cat);

                match next_cat {
                    // SB1 https://unicode.org/reports/tr29/#SB1
                    _ if state_before.match1(StatePart::Sot) =>
                        return Some(position_before),

                    // SB2 is handled when inner iterator (chars) is finished

                    // SB3 https://unicode.org/reports/tr29/#SB3
                    SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
                        continue,

                    // SB4 https://unicode.org/reports/tr29/#SB4
                    _ if state_before.match1(StatePart::Sep)
                        || state_before.match1(StatePart::CR)
                        || state_before.match1(StatePart::LF)
                    => return Some(position_before),

                    // SB5 https://unicode.org/reports/tr29/#SB5
                    // Undo the state update so the character leaves no trace
                    // in the window; `self.pos` stays advanced and the loop
                    // moves on to the next character.
                    SentenceCat::SC_Extend |
                    SentenceCat::SC_Format => self.state = state_before,

                    // SB6 https://unicode.org/reports/tr29/#SB6
                    SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
                        continue,

                    // SB7 https://unicode.org/reports/tr29/#SB7
                    SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
                        continue,

                    // SB8 https://unicode.org/reports/tr29/#SB8
                    // The look-ahead slice starts at the current character.
                    _ if match_sb8(&state_before, &self.string[position_before..]) =>
                        continue,

                    // SB8a https://unicode.org/reports/tr29/#SB8a
                    SentenceCat::SC_SContinue |
                    SentenceCat::SC_STerm |
                    SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
                        continue,

                    // SB9 https://unicode.org/reports/tr29/#SB9
                    SentenceCat::SC_Close |
                    SentenceCat::SC_Sp |
                    SentenceCat::SC_Sep |
                    SentenceCat::SC_CR |
                    SentenceCat::SC_LF if match_sb9(&state_before) =>
                        continue,

                    // SB10 https://unicode.org/reports/tr29/#SB10
                    // Shares its left context (SATerm Close* Sp*) with SB8a,
                    // hence the reuse of `match_sb8a`.
                    SentenceCat::SC_Sp |
                    SentenceCat::SC_Sep |
                    SentenceCat::SC_CR |
                    SentenceCat::SC_LF if match_sb8a(&state_before) =>
                        continue,

                    // SB11 https://unicode.org/reports/tr29/#SB11
                    _ if match_sb11(&state_before) =>
                        return Some(position_before),

                    // SB998 https://unicode.org/reports/tr29/#SB998
                    _ => continue
                }
            }

            // SB2 https://unicode.org/reports/tr29/#SB2
            // No characters left. An untouched state (`Sot`) means the string
            // was empty, and `Eot` means the final break was already reported;
            // both yield nothing. Otherwise report the end of the string once
            // and record `Eot` so later calls return `None`.
            if self.state.match1(StatePart::Sot) {
                None
            } else if self.state.match1(StatePart::Eot) {
                None
            } else {
                self.state = self.state.end();
                Some(self.pos)
            }
        }
    }

    // Create a break iterator over `source`, positioned at byte offset 0 and
    // starting from `INITIAL_STATE`.
    pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
        SentenceBreaks {
            state: INITIAL_STATE,
            pos: 0,
            string: source,
        }
    }

}

/// An iterator over the substrings of a string which, after splitting the string on
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UnicodeSentences<'a> {
    // Sentence-bounds iterator wrapped in a filter. The predicate is stored
    // as a plain `fn` pointer so that the full type can be written out here.
    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
}

/// External iterator for a string's
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBounds<'a> {
    // Source of break offsets; also holds the string that sentences are
    // sliced from
    iter: fwd::SentenceBreaks<'a>,
    // Byte offset at which the next sentence to be yielded starts; `None`
    // until the first break offset has been pulled from `iter`
    sentence_start: Option<usize>
}

/// External iterator for sentence boundaries and byte offsets.
///
/// This struct is created by the [`split_sentence_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBoundIndices<'a> {
    // Address of the first byte of the source string, as an integer; it is
    // subtracted from each sentence's address to obtain that sentence's
    // byte offset
    start_offset: usize,
    // Iterator that produces the sentences themselves
    iter: USentenceBounds<'a>,
}

// Create a sentence-bounds iterator over `source`. No break offset has been
// pulled yet, so the start of the first sentence is still unknown (`None`).
#[inline]
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
    let iter = fwd::new_sentence_breaks(source);
    USentenceBounds { iter, sentence_start: None }
}

// Create an iterator over `(byte offset, sentence)` pairs for `source`.
// The address of `source` is remembered so that each sentence's offset can
// later be computed from the sentence's own address.
#[inline]
pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
    let start_offset = source.as_ptr() as usize;
    let iter = new_sentence_bounds(source);
    USentenceBoundIndices { start_offset, iter }
}

// Create an iterator over the sentences of `s` that contain at least one
// character accepted by `is_alphanumeric`; all other sentences are skipped.
#[inline]
pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
    use crate::tables::util::is_alphanumeric;
    use super::UnicodeSegmentation;

    fn contains_alphanumeric(sentence: &&str) -> bool {
        sentence.chars().any(|ch| is_alphanumeric(ch))
    }

    // The cast turns the `fn` item into the plain function pointer type that
    // the `inner` field of `UnicodeSentences` is declared with.
    let keep = contains_alphanumeric as fn(&&str) -> bool;

    UnicodeSentences { inner: s.split_sentence_bounds().filter(keep) }
}

// Yields sentences by delegating to the wrapped, filtered iterator.
impl<'a> Iterator for UnicodeSentences<'a> {
    type Item = &'a str;

    // Next sentence accepted by the filter, or `None` when there is none left.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        let filtered = &mut self.inner;
        filtered.next()
    }
}

// Turns the stream of break offsets produced by `self.iter` into the string
// slices that lie between consecutive offsets.
impl<'a> Iterator for USentenceBounds<'a> {
    type Item = &'a str;

    // Bounds on the number of sentences, derived from the break iterator's
    // bounds: `n` break offsets delimit `n - 1` sentences, so each bound is
    // reduced by one, stopping at zero.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, upper) = self.iter.size_hint();
        // Clamp to 1 *before* subtracting. The bounds are unsigned, so a
        // plain `lower - 1` overflows when the break iterator reports 0
        // (empty input): a panic in debug builds, `usize::MAX` otherwise.
        (cmp::max(lower, 1) - 1, upper.map(|u| cmp::max(u, 1) - 1))
    }

    // Returns the next sentence, or `None` once the break iterator has no
    // further offset to close a sentence with.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        // The very first call has to pull the opening offset; later calls
        // start where the previous sentence ended.
        let start_pos = match self.sentence_start {
            Some(pos) => pos,
            None => {
                let pos = self.iter.next()?;
                self.sentence_start = Some(pos);
                pos
            }
        };

        let break_pos = self.iter.next()?;
        self.sentence_start = Some(break_pos);
        Some(&self.iter.string[start_pos..break_pos])
    }
}

// Pairs every sentence produced by the wrapped iterator with its byte offset
// from the start of the source string.
impl<'a> Iterator for USentenceBoundIndices<'a> {
    type Item = (usize, &'a str);

    // Same bounds as the wrapped iterator: one pair per sentence.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let sentences = &self.iter;
        sentences.size_hint()
    }

    // The offset is the distance between the sentence's address and the
    // address recorded for the start of the source string.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        let sentence = self.iter.next()?;
        let offset = sentence.as_ptr() as usize - self.start_offset;
        Some((offset, sentence))
    }
}
Commit	Line	Data
60c5eb7d XL	1	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
	2	// file at the top-level directory of this distribution and at
	3	// http://rust-lang.org/COPYRIGHT.
	4	//
	5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	8	// option. This file may not be copied, modified, or distributed
	9	// except according to those terms.
	10
	11	use core::cmp;
	12	use core::iter::Filter;
	13
	14	// All of the logic for forward iteration over sentences
	15	mod fwd {
136023e0	16	use crate::tables::sentence::SentenceCat;
60c5eb7d XL	17	use core::cmp;
	18
	19	// Describe a parsed part of source string as described in this table:
	20	// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
	21	#[derive(Clone, Copy, PartialEq, Eq)]
	22	enum StatePart {
	23	Sot,
	24	Eot,
	25	Other,
	26	CR,
	27	LF,
	28	Sep,
	29	ATerm,
	30	UpperLower,
	31	ClosePlus,
	32	SpPlus,
	33	STerm
	34	}
	35
	36	#[derive(Clone, PartialEq, Eq)]
	37	struct SentenceBreaksState(pub [StatePart; 4]);
	38
	39	const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
	40	StatePart::Sot,
	41	StatePart::Sot,
	42	StatePart::Sot,
	43	StatePart::Sot
	44	]);
	45
	46	#[derive(Clone)]
	47	pub struct SentenceBreaks<'a> {
	48	pub string: &'a str,
	49	pos: usize,
	50	state: SentenceBreaksState
	51	}
	52
	53	impl SentenceBreaksState {
	54	// Attempt to advance the internal state by one part
	55	// Whitespace and some punctutation will be collapsed
	56	fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
	57	let &SentenceBreaksState(parts) = self;
	58	let parts = match (parts[3], cat) {
	59	(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
	60	(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
	61	_ => [
	62	parts[1],
	63	parts[2],
	64	parts[3],
	65	match cat {
	66	SentenceCat::SC_CR => StatePart::CR,
	67	SentenceCat::SC_LF => StatePart::LF,
	68	SentenceCat::SC_Sep => StatePart::Sep,
	69	SentenceCat::SC_ATerm => StatePart::ATerm,
	70	SentenceCat::SC_Upper \|
	71	SentenceCat::SC_Lower => StatePart::UpperLower,
	72	SentenceCat::SC_Close => StatePart::ClosePlus,
	73	SentenceCat::SC_Sp => StatePart::SpPlus,
	74	SentenceCat::SC_STerm => StatePart::STerm,
	75	_ => StatePart::Other
	76	}
	77	]
	78	};
	79	SentenceBreaksState(parts)
	80	}
81
82	fn end(&self) -> SentenceBreaksState {
83	let &SentenceBreaksState(parts) = self;
84	SentenceBreaksState([
85	parts[1],
86	parts[2],
87	parts[3],
88	StatePart::Eot
89	])
90	}
91
92	// Helper function to check if state head matches a single `StatePart`
93	fn match1(&self, part: StatePart) -> bool {
94	let &SentenceBreaksState(parts) = self;
95	part == parts[3]
96	}
97
98	// Helper function to check if first two `StateParts` in state match
99	// the given two
100	fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
101	let &SentenceBreaksState(parts) = self;
102	part1 == parts[2] && part2 == parts[3]
103	}
104	}
105
106	// https://unicode.org/reports/tr29/#SB8
107	// TODO cache this, it is currently quadratic
108	fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
109	let &SentenceBreaksState(parts) = state;
110	let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
111	if parts[idx] == StatePart::ClosePlus { idx -= 1 }
112
113	if parts[idx] == StatePart::ATerm {
136023e0	114	use crate::tables::sentence as se;
60c5eb7d XL	115
	116	for next_char in ahead.chars() {
	117	//( ¬(OLetter \| Upper \| Lower \| ParaSep \| SATerm) )* Lower
5869c6ff	118	match se::sentence_category(next_char).2 {
60c5eb7d XL	119	se::SC_Lower => return true,
	120	se::SC_OLetter \|
	121	se::SC_Upper \|
	122	se::SC_Sep \| se::SC_CR \| se::SC_LF \|
	123	se::SC_STerm \| se::SC_ATerm => return false,
	124	_ => continue
	125	}
	126	}
	127	}
	128
	129	false
	130	}
	131
	132	// https://unicode.org/reports/tr29/#SB8a
	133	fn match_sb8a(state: &SentenceBreaksState) -> bool {
	134	// SATerm Close* Sp*
	135	let &SentenceBreaksState(parts) = state;
	136	let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
	137	if parts[idx] == StatePart::ClosePlus { idx -= 1 }
	138	parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
	139	}
	140
	141	// https://unicode.org/reports/tr29/#SB9
	142	fn match_sb9(state: &SentenceBreaksState) -> bool {
	143	// SATerm Close*
	144	let &SentenceBreaksState(parts) = state;
	145	let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 };
	146	parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
	147	}
	148
	149	// https://unicode.org/reports/tr29/#SB11
	150	fn match_sb11(state: &SentenceBreaksState) -> bool {
	151	// SATerm Close* Sp* ParaSep?
	152	let &SentenceBreaksState(parts) = state;
	153	let mut idx = match parts[3] {
	154	StatePart::Sep \|
	155	StatePart::CR \|
	156	StatePart::LF => 2,
	157	_ => 3
	158	};
	159
	160	if parts[idx] == StatePart::SpPlus { idx -= 1 }
	161	if parts[idx] == StatePart::ClosePlus { idx -= 1}
	162
	163	parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
	164	}
	165
	166	impl<'a> Iterator for SentenceBreaks<'a> {
	167	// Returns the index of the character which follows a break
	168	type Item = usize;
	169
	170	#[inline]
	171	fn size_hint(&self) -> (usize, Option<usize>) {
	172	let slen = self.string.len();
	173	// A sentence could be one character
	174	(cmp::min(slen, 2), Some(slen + 1))
	175	}
	176
	177	#[inline]
	178	fn next(&mut self) -> Option<usize> {
136023e0	179	use crate::tables::sentence as se;
60c5eb7d XL	180
	181	for next_char in self.string[self.pos..].chars() {
	182	let position_before = self.pos;
	183	let state_before = self.state.clone();
	184
5869c6ff	185	let next_cat = se::sentence_category(next_char).2;
60c5eb7d XL	186
	187	self.pos += next_char.len_utf8();
	188	self.state = self.state.next(next_cat);
	189
	190	match next_cat {
	191	// SB1 https://unicode.org/reports/tr29/#SB1
	192	_ if state_before.match1(StatePart::Sot) =>
	193	return Some(position_before),
	194
	195	// SB2 is handled when inner iterator (chars) is finished
	196
	197	// SB3 https://unicode.org/reports/tr29/#SB3
	198	SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
	199	continue,
	200
	201	// SB4 https://unicode.org/reports/tr29/#SB4
	202	_ if state_before.match1(StatePart::Sep)
	203	\|\| state_before.match1(StatePart::CR)
	204	\|\| state_before.match1(StatePart::LF)
	205	=> return Some(position_before),
	206
	207	// SB5 https://unicode.org/reports/tr29/#SB5
	208	SentenceCat::SC_Extend \|
	209	SentenceCat::SC_Format => self.state = state_before,
	210
	211	// SB6 https://unicode.org/reports/tr29/#SB6
	212	SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
	213	continue,
	214
	215	// SB7 https://unicode.org/reports/tr29/#SB7
	216	SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
	217	continue,
	218
	219	// SB8 https://unicode.org/reports/tr29/#SB8
	220	_ if match_sb8(&state_before, &self.string[position_before..]) =>
	221	continue,
	222
	223	// SB8a https://unicode.org/reports/tr29/#SB8a
	224	SentenceCat::SC_SContinue \|
	225	SentenceCat::SC_STerm \|
	226	SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
	227	continue,
	228
	229	// SB9 https://unicode.org/reports/tr29/#SB9
	230	SentenceCat::SC_Close \|
	231	SentenceCat::SC_Sp \|
	232	SentenceCat::SC_Sep \|
	233	SentenceCat::SC_CR \|
	234	SentenceCat::SC_LF if match_sb9(&state_before) =>
	235	continue,
	236
	237	// SB10 https://unicode.org/reports/tr29/#SB10
	238	SentenceCat::SC_Sp \|
	239	SentenceCat::SC_Sep \|
	240	SentenceCat::SC_CR \|
	241	SentenceCat::SC_LF if match_sb8a(&state_before) =>
	242	continue,
	243
	244	// SB11 https://unicode.org/reports/tr29/#SB11
	245	_ if match_sb11(&state_before) =>
	246	return Some(position_before),
	247
	248	// SB998 https://unicode.org/reports/tr29/#SB998
	249	_ => continue
250	}
251	}
252
253	// SB2 https://unicode.org/reports/tr29/#SB2
254	if self.state.match1(StatePart::Sot) {
255	None
256	} else if self.state.match1(StatePart::Eot) {
257	None
258	} else {
259	self.state = self.state.end();
260	Some(self.pos)
261	}
262	}
263	}
264
265	pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
266	SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE }
267	}
268
269	}
270
271	/// An iterator over the substrings of a string which, after splitting the string on
272	/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
273	/// contain any characters with the
274	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
275	/// property, or with
276	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
5869c6ff XL	277	///
	278	/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
	279	/// trait. See its documentation for more.
	280	///
	281	/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
	282	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
60c5eb7d XL	283	#[derive(Clone)]
	284	pub struct UnicodeSentences<'a> {
	285	inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
	286	}
	287
	288	/// External iterator for a string's
	289	/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
5869c6ff XL	290	///
	291	/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
	292	/// trait. See its documentation for more.
	293	///
	294	/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
	295	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
60c5eb7d XL	296	#[derive(Clone)]
	297	pub struct USentenceBounds<'a> {
	298	iter: fwd::SentenceBreaks<'a>,
	299	sentence_start: Option<usize>
	300	}
	301
	302	/// External iterator for sentence boundaries and byte offsets.
5869c6ff XL	303	///
	304	/// This struct is created by the [`split_sentence_bound_indices`] method on the
	305	/// [`UnicodeSegmentation`] trait. See its documentation for more.
	306	///
	307	/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
	308	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
60c5eb7d XL	309	#[derive(Clone)]
	310	pub struct USentenceBoundIndices<'a> {
	311	start_offset: usize,
	312	iter: USentenceBounds<'a>,
	313	}
	314
	315	#[inline]
	316	pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
	317	USentenceBounds {
	318	iter: fwd::new_sentence_breaks(source),
	319	sentence_start: None
	320	}
	321	}
	322
	323	#[inline]
	324	pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
	325	USentenceBoundIndices {
	326	start_offset: source.as_ptr() as usize,
	327	iter: new_sentence_bounds(source)
	328	}
	329	}
	330
	331	#[inline]
	332	pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
	333	use super::UnicodeSegmentation;
136023e0	334	use crate::tables::util::is_alphanumeric;
60c5eb7d XL	335
	336	fn has_alphanumeric(s: &&str) -> bool { s.chars().any(\|c\| is_alphanumeric(c)) }
	337	let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
	338
	339	UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) }
	340	}
	341
	342	impl<'a> Iterator for UnicodeSentences<'a> {
	343	type Item = &'a str;
	344
	345	#[inline]
	346	fn next(&mut self) -> Option<&'a str> { self.inner.next() }
	347	}
	348
	349	impl<'a> Iterator for USentenceBounds<'a> {
	350	type Item = &'a str;
	351
	352	#[inline]
	353	fn size_hint(&self) -> (usize, Option<usize>) {
	354	let (lower, upper) = self.iter.size_hint();
	355	(cmp::max(0, lower - 1), upper.map(\|u\| cmp::max(0, u - 1)))
	356	}
	357
	358	#[inline]
	359	fn next(&mut self) -> Option<&'a str> {
	360	if self.sentence_start == None {
	361	if let Some(start_pos) = self.iter.next() {
	362	self.sentence_start = Some(start_pos)
	363	} else {
	364	return None
	365	}
	366	}
	367
	368	if let Some(break_pos) = self.iter.next() {
	369	let start_pos = self.sentence_start.unwrap();
	370	let sentence = &self.iter.string[start_pos..break_pos];
	371	self.sentence_start = Some(break_pos);
	372	Some(sentence)
	373	} else {
	374	None
	375	}
	376	}
	377	}
	378
	379	impl<'a> Iterator for USentenceBoundIndices<'a> {
	380	type Item = (usize, &'a str);
	381
	382	#[inline]
	383	fn next(&mut self) -> Option<(usize, &'a str)> {
	384	self.iter.next().map(\|s\| (s.as_ptr() as usize - self.start_offset, s))
	385	}
	386
	387	#[inline]
	388	fn size_hint(&self) -> (usize, Option<usize>) {
	389	self.iter.size_hint()
	390	}
	391	}