// option. This file may not be copied, modified, or distributed
// except according to those terms.
-//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
+//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
//!
//! let s = "The quick (\"brown\") fox";
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
-//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
+//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//! assert_eq!(w, b);
//! }
//! ```
//!
//! ```toml
//! [dependencies]
-//! unicode-segmentation = "1.1.0"
+//! unicode-segmentation = "1.3.0"
//! ```
#![deny(missing_docs, unsafe_code)]
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
+pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
mod grapheme;
mod tables;
mod word;
+mod sentence;
#[cfg(test)]
mod test;
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
- /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
+ /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
+
+ /// Returns an iterator over substrings of `self` separated on
+ /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
+ ///
+ /// Here, "sentences" are just those substrings which, after splitting on
+ /// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
+ /// substring must contain at least one character with the
+ /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+ /// property, or with
+ /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+ fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
+
+ /// Returns an iterator over substrings of `self` separated on
+ /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
+ ///
+ /// The concatenation of the substrings returned by this function is just the original string.
+ fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
+
+ /// Returns an iterator over the substrings of `self` produced by splitting on UAX#29
+ /// sentence boundaries, together with their offsets. See `split_sentence_bounds()` for details.
+ fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
}
impl UnicodeSegmentation for str {
fn split_word_bound_indices(&self) -> UWordBoundIndices {
word::new_word_bound_indices(self)
}
+ // Sentence entry points: each one forwards `self` to a constructor in the `sentence` module.
+ #[inline]
+ fn unicode_sentences(&self) -> UnicodeSentences {
+ sentence::new_unicode_sentences(self)
+ }
+
+ #[inline]
+ fn split_sentence_bounds(&self) -> USentenceBounds {
+ sentence::new_sentence_bounds(self)
+ }
+
+ #[inline]
+ fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
+ sentence::new_sentence_bound_indices(self)
+ }
}