vendor/unicode-segmentation/src/lib.rs

   1 // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
  12 //! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
  13 //!
  14 //! ```rust
  15 //! extern crate unicode_segmentation;
  16 //!
  17 //! use unicode_segmentation::UnicodeSegmentation;
  18 //!
  19 //! fn main() {
  20 //!     let s = "a̐éö̲\r\n";
  21 //!     let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
  22 //!     let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
  23 //!     assert_eq!(g, b);
  24 //!
  25 //!     let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
  26 //!     let w = s.unicode_words().collect::<Vec<&str>>();
  27 //!     let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
  28 //!     assert_eq!(w, b);
  29 //!
  30 //!     let s = "The quick (\"brown\")  fox";
  31 //!     let w = s.split_word_bounds().collect::<Vec<&str>>();
  32 //!     let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
  33 //!     assert_eq!(w, b);
  34 //! }
  35 //! ```
  36 //!
  37 //! # no_std
  38 //!
  39 //! unicode-segmentation does not depend on libstd, so it can be used in crates
  40 //! with the `#![no_std]` attribute.
  41 //!
  42 //! # crates.io
  43 //!
  44 //! You can use this package in your project by adding the following
  45 //! to your `Cargo.toml`:
  46 //!
  47 //! ```toml
  48 //! [dependencies]
  49 //! unicode-segmentation = "1.3.0"
  50 //! ```
  51
  52 #![deny(missing_docs, unsafe_code)]
  53 #![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
  54        html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]
  55
  56 #![no_std]
  57
  58 #[cfg(test)]
  59 #[macro_use]
  60 extern crate std;
  61
  62 #[cfg(test)]
  63 #[macro_use]
  64 extern crate quickcheck;
  65
  66 pub use grapheme::{Graphemes, GraphemeIndices};
  67 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
  68 pub use tables::UNICODE_VERSION;
  69 pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
  70 pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
  71
  72 mod grapheme;
  73 mod tables;
  74 mod word;
  75 mod sentence;
  76
  77 #[cfg(test)]
  78 mod test;
  79 #[cfg(test)]
  80 mod testdata;
  81
  82 /// Methods for segmenting strings according to
  83 /// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
  84 pub trait UnicodeSegmentation {
  85     /// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
  86     ///
  87     /// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
  88     ///
  89     /// If `is_extended` is true, the iterator is over the
  90     /// *extended grapheme clusters*;
  91     /// otherwise, the iterator is over the *legacy grapheme clusters*.
  92     /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
  93     /// recommends extended grapheme cluster boundaries for general processing.
  94     ///
  95     /// # Examples
  96     ///
  97     /// ```
  98     /// # use self::unicode_segmentation::UnicodeSegmentation;
  99     /// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
 100     ///           .collect::<Vec<&str>>();
 101     /// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
 102     ///
 103     /// assert_eq!(&gr1[..], b);
 104     ///
 105     /// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
 106     /// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
 107     ///
 108     /// assert_eq!(&gr2[..], b);
 109     /// ```
 110     fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
 111
 112     /// Returns an iterator over the grapheme clusters of `self` and their
 113     /// byte offsets. See `graphemes()` for more information.
 114     ///
 115     /// # Examples
 116     ///
 117     /// ```
 118     /// # use self::unicode_segmentation::UnicodeSegmentation;
 119     /// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
 120     ///               .collect::<Vec<(usize, &str)>>();
 121     /// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
 122     ///
 123     /// assert_eq!(&gr_inds[..], b);
 124     /// ```
 125     fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
 126
 127     /// Returns an iterator over the words of `self`, separated on
 128     /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
 129     ///
 130     /// Here, "words" are just those substrings which, after splitting on
 131     /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
 132     /// substring must contain at least one character with the
 133     /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
 134     /// property, or with
 135     /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
 136     ///
 137     /// # Example
 138     ///
 139     /// ```
 140     /// # use self::unicode_segmentation::UnicodeSegmentation;
 141     /// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
 142     /// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
 143     /// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
 144     ///
 145     /// assert_eq!(&uw1[..], b);
 146     /// ```
 147     fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
 148
 149     /// Returns an iterator over substrings of `self` separated on
 150     /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
 151     ///
 152     /// The concatenation of the substrings returned by this function is just the original string.
 153     ///
 154     /// # Example
 155     ///
 156     /// ```
 157     /// # use self::unicode_segmentation::UnicodeSegmentation;
 158     /// let swu1 = "The quick (\"brown\")  fox".split_word_bounds().collect::<Vec<&str>>();
 159     /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
 160     ///
 161     /// assert_eq!(&swu1[..], b);
 162     /// ```
 163     fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
 164
 165     /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
 166     /// and their offsets. See `split_word_bounds()` for more information.
 167     ///
 168     /// # Example
 169     ///
 170     /// ```
 171     /// # use self::unicode_segmentation::UnicodeSegmentation;
 172     /// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
 173     /// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
 174     ///                 (14, "°"), (16, "F"), (17, "!")];
 175     ///
 176     /// assert_eq!(&swi1[..], b);
 177     /// ```
 178     fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
 179
 180     /// Returns an iterator over substrings of `self` separated on
 181     /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
 182     ///
 183     /// The concatenation of the substrings returned by this function is just the original string.
 184     fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
 185
 186     /// Returns an iterator over substrings of `self` separated on
 187     /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
 188     ///
 189     /// Here, "sentences" are just those substrings which, after splitting on
 190     /// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
 191     /// substring must contain at least one character with the
 192     /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
 193     /// property, or with
 194     /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
 195     fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
 196
 197     /// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
 198     /// and their offsets. See `split_sentence_bounds()` for more information.
 199     fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
 200 }
 201
 202 impl UnicodeSegmentation for str {
 203     #[inline]
 204     fn graphemes(&self, is_extended: bool) -> Graphemes {
 205         grapheme::new_graphemes(self, is_extended)
 206     }
 207
 208     #[inline]
 209     fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
 210         grapheme::new_grapheme_indices(self, is_extended)
 211     }
 212
 213     #[inline]
 214     fn unicode_words(&self) -> UnicodeWords {
 215         word::new_unicode_words(self)
 216     }
 217
 218     #[inline]
 219     fn split_word_bounds(&self) -> UWordBounds {
 220         word::new_word_bounds(self)
 221     }
 222
 223     #[inline]
 224     fn split_word_bound_indices(&self) -> UWordBoundIndices {
 225         word::new_word_bound_indices(self)
 226     }
 227
 228     #[inline]
 229     fn unicode_sentences(&self) -> UnicodeSentences {
 230         sentence::new_unicode_sentences(self)
 231     }
 232
 233     #[inline]
 234     fn split_sentence_bounds(&self) -> USentenceBounds {
 235         sentence::new_sentence_bounds(self)
 236     }
 237
 238     #[inline]
 239     fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
 240         sentence::new_sentence_bound_indices(self)
 241     }
 242 }