use regex_automata::DFA;
-use bstr::BStr;
+use ext_slice::ByteSlice;
use unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD;
use unicode::fsm::word_break_fwd::WORD_BREAK_FWD;
use utf8;
/// An iterator over words in a byte string.
///
/// This iterator is typically constructed by
-/// [`bstr::words`](struct.BStr.html#method.words).
+/// [`ByteSlice::words`](trait.ByteSlice.html#method.words).
///
/// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html)
/// iterator, except it only returns elements that contain a "word" character.
pub struct Words<'a>(WordsWithBreaks<'a>);
impl<'a> Words<'a> {
/// Builds a `Words` iterator over `bs` by wrapping a `WordsWithBreaks`
/// iterator constructed from the same bytes.
- pub(crate) fn new(bs: &'a BStr) -> Words<'a> {
+ pub(crate) fn new(bs: &'a [u8]) -> Words<'a> {
Words(WordsWithBreaks::new(bs))
}
/// # Examples
///
/// ```
- /// use bstr::B;
+ /// use bstr::ByteSlice;
///
- /// let mut it = B("foo bar baz").words();
+ /// let mut it = b"foo bar baz".words();
///
- /// assert_eq!("foo bar baz", it.as_bstr());
+ /// assert_eq!(b"foo bar baz", it.as_bytes());
/// it.next();
/// it.next();
- /// assert_eq!(" baz", it.as_bstr());
+ /// assert_eq!(b" baz", it.as_bytes());
/// it.next();
- /// assert_eq!("", it.as_bstr());
+ /// assert_eq!(b"", it.as_bytes());
/// ```
#[inline]
// Forwards to the accessor of the wrapped `WordsWithBreaks` iterator.
- pub fn as_bstr(&self) -> &'a BStr {
- self.0.as_bstr()
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.0.as_bytes()
}
}
/// An iterator over words in a byte string and their byte index positions.
///
/// This iterator is typically constructed by
-/// [`bstr::word_indices`](struct.BStr.html#method.word_indices).
+/// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices).
///
/// This is similar to the
/// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator,
pub struct WordIndices<'a>(WordsWithBreakIndices<'a>);
impl<'a> WordIndices<'a> {
/// Builds a `WordIndices` iterator over `bs` by wrapping a
/// `WordsWithBreakIndices` iterator constructed from the same bytes.
- pub(crate) fn new(bs: &'a BStr) -> WordIndices<'a> {
+ pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> {
WordIndices(WordsWithBreakIndices::new(bs))
}
/// # Examples
///
/// ```
- /// use bstr::B;
+ /// use bstr::ByteSlice;
///
- /// let mut it = B("foo bar baz").word_indices();
+ /// let mut it = b"foo bar baz".word_indices();
///
- /// assert_eq!("foo bar baz", it.as_bstr());
+ /// assert_eq!(b"foo bar baz", it.as_bytes());
/// it.next();
/// it.next();
- /// assert_eq!(" baz", it.as_bstr());
+ /// assert_eq!(b" baz", it.as_bytes());
/// it.next();
/// it.next();
- /// assert_eq!("", it.as_bstr());
+ /// assert_eq!(b"", it.as_bytes());
/// ```
#[inline]
// Forwards to the accessor of the wrapped `WordsWithBreakIndices` iterator.
- pub fn as_bstr(&self) -> &'a BStr {
- self.0.as_bstr()
+ pub fn as_bytes(&self) -> &'a [u8] {
+ self.0.as_bytes()
}
}
/// An iterator over all word breaks in a byte string.
///
/// This iterator is typically constructed by
-/// [`bstr::words_with_breaks`](struct.BStr.html#method.words_with_breaks).
+/// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks).
///
/// This iterator yields not only all words, but the content that comes between
/// words. In particular, if all elements yielded by this iterator are
/// that do not use spaces between words.
#[derive(Clone, Debug)]
pub struct WordsWithBreaks<'a> {
// The byte slice this iterator reads from; the accessor method on this
// type returns it as-is.
- bs: &'a BStr,
+ bs: &'a [u8],
}
impl<'a> WordsWithBreaks<'a> {
/// Builds a `WordsWithBreaks` iterator that borrows `bs` for its whole
/// lifetime `'a`; no bytes are copied.
- pub(crate) fn new(bs: &'a BStr) -> WordsWithBreaks<'a> {
+ pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> {
WordsWithBreaks { bs }
}
/// # Examples
///
/// ```
- /// use bstr::B;
+ /// use bstr::ByteSlice;
///
- /// let mut it = B("foo bar baz").words_with_breaks();
+ /// let mut it = b"foo bar baz".words_with_breaks();
///
- /// assert_eq!("foo bar baz", it.as_bstr());
+ /// assert_eq!(b"foo bar baz", it.as_bytes());
/// it.next();
- /// assert_eq!(" bar baz", it.as_bstr());
+ /// assert_eq!(b" bar baz", it.as_bytes());
/// it.next();
/// it.next();
- /// assert_eq!(" baz", it.as_bstr());
+ /// assert_eq!(b" baz", it.as_bytes());
/// it.next();
/// it.next();
- /// assert_eq!("", it.as_bstr());
+ /// assert_eq!(b"", it.as_bytes());
/// ```
#[inline]
// Returns the stored slice unchanged; per the examples above, that slice
// gets shorter each time the iterator is advanced.
- pub fn as_bstr(&self) -> &'a BStr {
+ pub fn as_bytes(&self) -> &'a [u8] {
self.bs
}
}
/// index positions.
///
/// This iterator is typically constructed by
-/// [`bstr::words_with_break_indices`](struct.BStr.html#method.words_with_break_indices).
+/// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices).
///
/// This iterator yields not only all words, but the content that comes between
/// words. In particular, if all elements yielded by this iterator are
/// that do not use spaces between words.
#[derive(Clone, Debug)]
pub struct WordsWithBreakIndices<'a> {
// The byte slice this iterator reads from; the accessor method on this
// type returns it as-is.
- bs: &'a BStr,
+ bs: &'a [u8],
// Starts at 0 in `new`. NOTE(review): presumably the byte offset, within
// the original input, of the next element to yield -- confirm against the
// Iterator impl.
forward_index: usize,
}
impl<'a> WordsWithBreakIndices<'a> {
/// Builds a `WordsWithBreakIndices` iterator that borrows `bs` and starts
/// with `forward_index` set to 0.
- pub(crate) fn new(bs: &'a BStr) -> WordsWithBreakIndices<'a> {
+ pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
WordsWithBreakIndices { bs: bs, forward_index: 0 }
}
/// # Examples
///
/// ```
- /// use bstr::B;
+ /// use bstr::ByteSlice;
///
- /// let mut it = B("foo bar baz").words_with_break_indices();
+ /// let mut it = b"foo bar baz".words_with_break_indices();
///
- /// assert_eq!("foo bar baz", it.as_bstr());
+ /// assert_eq!(b"foo bar baz", it.as_bytes());
/// it.next();
- /// assert_eq!(" bar baz", it.as_bstr());
+ /// assert_eq!(b" bar baz", it.as_bytes());
/// it.next();
/// it.next();
- /// assert_eq!(" baz", it.as_bstr());
+ /// assert_eq!(b" baz", it.as_bytes());
/// it.next();
/// it.next();
- /// assert_eq!("", it.as_bstr());
+ /// assert_eq!(b"", it.as_bytes());
/// ```
#[inline]
// Returns the stored slice unchanged; per the examples above, that slice
// gets shorter each time the iterator is advanced.
- pub fn as_bstr(&self) -> &'a BStr {
+ pub fn as_bytes(&self) -> &'a [u8] {
self.bs
}
}
}
}
/// Decodes one word from the front of `bs`.
///
/// Returns the decoded text together with a byte count:
///
/// * Empty input yields `("", 0)`.
/// * When `WORD_BREAK_FWD` reports a match ending at `end`, the result is
///   `bs[..end]` viewed as a `&str`, paired with its length in bytes.
/// * Otherwise the Unicode replacement character (U+FFFD) is returned, and
///   the count is the size reported by `utf8::decode_lossy` rather than the
///   length of the returned string.
///
/// NOTE(review): the count is presumably how far the caller should advance
/// through the input; confirm against the iterator implementations.
-fn decode_word(bs: &BStr) -> (&str, usize) {
+fn decode_word(bs: &[u8]) -> (&str, usize) {
if bs.is_empty() {
("", 0)
- } else if let Some(end) = WORD_BREAK_FWD.find(bs.as_bytes()) {
+ } else if let Some(end) = WORD_BREAK_FWD.find(bs) {
// SAFETY: a match can only occur for valid UTF-8, so `bs[..end]` can be
// viewed as a `&str` without re-validating it.
let word = unsafe { bs[..end].to_str_unchecked() };
(word, word.len())
} else {
const INVALID: &'static str = "\u{FFFD}";
// No match on non-empty bytes implies we found invalid UTF-8. The
// replacement character stands in for it, while the returned size is
// whatever `decode_lossy` reports for the offending bytes.
- let (_, size) = utf8::decode_lossy(bs.as_bytes());
+ let (_, size) = utf8::decode_lossy(bs);
(INVALID, size)
}
}
mod tests {
use ucd_parse::WordBreakTest;
- use bstr::BStr;
+ use ext_slice::ByteSlice;
#[test]
fn forward_ucd() {
test.words,
got,
"\n\nword forward break test {} failed:\n\
- given: {:?}\n\
- expected: {:?}\n\
- got: {:?}\n",
+ given: {:?}\n\
+ expected: {:?}\n\
+ got: {:?}\n",
i,
- BStr::new(&given),
+ given,
strs_to_bstrs(&test.words),
strs_to_bstrs(&got),
);
#[test]
fn forward_additional() {
assert_eq!(vec!["a", ".", " ", "Y"], words(b"a. Y"));
- assert_eq!(
- vec!["r", ".", " ", "Yo"],
- words(b"r. Yo")
- );
+ assert_eq!(vec!["r", ".", " ", "Yo"], words(b"r. Yo"));
assert_eq!(
vec!["whatsoever", ".", " ", "You", " ", "may"],
words(b"whatsoever. You may")
words(b"21stcentury'syesterday")
);
- assert_eq!(
- vec!["Bonta_", "'", "s"],
- words(b"Bonta_'s")
- );
- assert_eq!(
- vec!["_vhat's"],
- words(b"_vhat's")
- );
- assert_eq!(
- vec!["__on'anima"],
- words(b"__on'anima")
- );
- assert_eq!(
- vec!["123_", "'", "4"],
- words(b"123_'4")
- );
- assert_eq!(
- vec!["_123'4"],
- words(b"_123'4")
- );
- assert_eq!(
- vec!["__12'345"],
- words(b"__12'345")
- );
+ assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s"));
+ assert_eq!(vec!["_vhat's"], words(b"_vhat's"));
+ assert_eq!(vec!["__on'anima"], words(b"__on'anima"));
+ assert_eq!(vec!["123_", "'", "4"], words(b"123_'4"));
+ assert_eq!(vec!["_123'4"], words(b"_123'4"));
+ assert_eq!(vec!["__12'345"], words(b"__12'345"));
assert_eq!(
vec!["tomorrowat4", ":", "00", ","],
words(b"tomorrowat4:00,")
);
- assert_eq!(
- vec!["RS1", "'", "s"],
- words(b"RS1's")
- );
- assert_eq!(
- vec!["X38"],
- words(b"X38")
- );
+ assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's"));
+ assert_eq!(vec!["X38"], words(b"X38"));
- assert_eq!(
- vec!["4abc", ":", "00", ","],
- words(b"4abc:00,")
- );
- assert_eq!(
- vec!["12S", "'", "1"],
- words(b"12S'1")
- );
- assert_eq!(
- vec!["1XY"],
- words(b"1XY")
- );
+ assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,"));
+ assert_eq!(vec!["12S", "'", "1"], words(b"12S'1"));
+ assert_eq!(vec!["1XY"], words(b"1XY"));
- assert_eq!(
- vec!["\u{FEFF}", "Ты"],
- words("\u{FEFF}Ты".as_bytes())
- );
+ assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
}
/// Test helper: collects every element yielded by `words_with_breaks()`
/// over `bytes` into a vector, in iteration order.
fn words(bytes: &[u8]) -> Vec<&str> {
- BStr::new(bytes).words_with_breaks().collect()
+ bytes.words_with_breaks().collect()
}
/// Test helper: borrows each string in `strs` as a byte string, preserving
/// order. Used by `forward_ucd` to format the expected and actual word
/// lists in its failure message.
- fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&BStr> {
- strs.iter().map(|s| BStr::new(s.as_ref())).collect()
+ fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
+ strs.iter().map(|s| s.as_ref().as_bytes()).collect()
}
/// Return all of the UCD for word breaks.
fn ucdtests() -> Vec<WordBreakTest> {
- const TESTDATA: &'static str = include_str!(
- "data/WordBreakTest.txt"
- );
+ const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");
let mut tests = vec![];
for mut line in TESTDATA.lines() {