]> git.proxmox.com Git - cargo.git/blob - vendor/bstr/src/ext_slice.rs
New upstream version 0.47.0
[cargo.git] / vendor / bstr / src / ext_slice.rs
1 #[cfg(feature = "std")]
2 use std::borrow::Cow;
3 #[cfg(feature = "std")]
4 use std::ffi::OsStr;
5 #[cfg(feature = "std")]
6 use std::path::Path;
7
8 use core::cmp;
9 use core::ops;
10 use core::ptr;
11 use core::slice;
12 use core::str;
13
14 use memchr::{memchr, memrchr};
15
16 use ascii;
17 use bstr::BStr;
18 use byteset;
19 #[cfg(feature = "std")]
20 use ext_vec::ByteVec;
21 use search::{PrefilterState, TwoWay};
22 #[cfg(feature = "unicode")]
23 use unicode::{
24 whitespace_len_fwd, whitespace_len_rev, GraphemeIndices, Graphemes,
25 SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices,
26 WordsWithBreaks,
27 };
28 use utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error};
29
30 /// A short-hand constructor for building a `&[u8]`.
31 ///
32 /// This idiosyncratic constructor is useful for concisely building byte string
33 /// slices. Its primary utility is in conveniently writing byte string literals
34 /// in a uniform way. For example, consider this code that does not compile:
35 ///
36 /// ```ignore
37 /// let strs = vec![b"a", b"xy"];
38 /// ```
39 ///
40 /// The above code doesn't compile because the type of the byte string literal
41 /// `b"a"` is `&'static [u8; 1]`, and the type of `b"xy"` is
42 /// `&'static [u8; 2]`. Since their types aren't the same, they can't be stored
43 /// in the same `Vec`. (This is dissimilar from normal Unicode string slices,
44 /// where both `"a"` and `"xy"` have the same type of `&'static str`.)
45 ///
46 /// One way of getting the above code to compile is to convert byte strings to
47 /// slices. You might try this:
48 ///
49 /// ```ignore
50 /// let strs = vec![&b"a", &b"xy"];
51 /// ```
52 ///
53 /// But this just creates values with type `& &'static [u8; 1]` and
54 /// `& &'static [u8; 2]`. Instead, you need to force the issue like so:
55 ///
56 /// ```
57 /// let strs = vec![&b"a"[..], &b"xy"[..]];
58 /// // or
59 /// let strs = vec![b"a".as_ref(), b"xy".as_ref()];
60 /// ```
61 ///
62 /// But neither of these are particularly convenient to type, especially when
63 /// it's something as common as a string literal. Thus, this constructor
64 /// permits writing the following instead:
65 ///
66 /// ```
67 /// use bstr::B;
68 ///
69 /// let strs = vec![B("a"), B(b"xy")];
70 /// ```
71 ///
72 /// Notice that this also lets you mix and match both string literals and byte
73 /// string literals. This can be quite convenient!
#[allow(non_snake_case)]
#[inline]
pub fn B<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a [u8] {
    // Spell out the trait so that the `[u8]` view is selected explicitly
    // rather than through method-call inference.
    <B as AsRef<[u8]>>::as_ref(bytes)
}
79
80 impl ByteSlice for [u8] {
81 #[inline]
82 fn as_bytes(&self) -> &[u8] {
83 self
84 }
85
86 #[inline]
87 fn as_bytes_mut(&mut self) -> &mut [u8] {
88 self
89 }
90 }
91
/// Ensure that callers cannot implement `ByteSlice` by making an
/// unimplementable trait its super trait.
pub trait Sealed {}
// `[u8]` is the one implementor visible in this part of the file.
impl Sealed for [u8] {}
96
97 /// A trait that extends `&[u8]` with string oriented methods.
98 pub trait ByteSlice: Sealed {
/// A method for accessing the raw bytes of this type. This is always a
/// no-op and callers shouldn't care about it. This only exists for making
/// the extension trait work.
///
/// The provided methods of this trait read the underlying bytes through
/// this accessor.
#[doc(hidden)]
fn as_bytes(&self) -> &[u8];
104
/// A method for accessing the raw bytes of this type, mutably. This is
/// always a no-op and callers shouldn't care about it. This only exists
/// for making the extension trait work.
///
/// Provided methods that need mutable access (such as `as_bstr_mut`) go
/// through this accessor.
#[doc(hidden)]
fn as_bytes_mut(&mut self) -> &mut [u8];
110
111 /// Return this byte slice as a `&BStr`.
112 ///
113 /// Using `&BStr` is useful because of its `fmt::Debug` representation
114 /// and various other trait implementations (such as `PartialEq` and
115 /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
116 /// shows its bytes as a normal string. For invalid UTF-8, hex escape
117 /// sequences are used.
118 ///
119 /// # Examples
120 ///
121 /// Basic usage:
122 ///
123 /// ```
124 /// use bstr::ByteSlice;
125 ///
126 /// println!("{:?}", b"foo\xFFbar".as_bstr());
127 /// ```
128 #[inline]
129 fn as_bstr(&self) -> &BStr {
130 BStr::new(self.as_bytes())
131 }
132
133 /// Return this byte slice as a `&mut BStr`.
134 ///
135 /// Using `&mut BStr` is useful because of its `fmt::Debug` representation
136 /// and various other trait implementations (such as `PartialEq` and
137 /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
138 /// shows its bytes as a normal string. For invalid UTF-8, hex escape
139 /// sequences are used.
140 ///
141 /// # Examples
142 ///
143 /// Basic usage:
144 ///
145 /// ```
146 /// use bstr::ByteSlice;
147 ///
148 /// let mut bytes = *b"foo\xFFbar";
149 /// println!("{:?}", &mut bytes.as_bstr_mut());
150 /// ```
151 #[inline]
152 fn as_bstr_mut(&mut self) -> &mut BStr {
153 BStr::new_mut(self.as_bytes_mut())
154 }
155
156 /// Create an immutable byte string from an OS string slice.
157 ///
158 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
159 /// this returns `None` if the given OS string is not valid UTF-8. (For
160 /// example, on Windows, file paths are allowed to be a sequence of
161 /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
162 /// valid UTF-8.)
163 ///
164 /// # Examples
165 ///
166 /// Basic usage:
167 ///
168 /// ```
169 /// use std::ffi::OsStr;
170 ///
171 /// use bstr::{B, ByteSlice};
172 ///
173 /// let os_str = OsStr::new("foo");
174 /// let bs = <[u8]>::from_os_str(os_str).expect("should be valid UTF-8");
175 /// assert_eq!(bs, B("foo"));
176 /// ```
#[cfg(feature = "std")]
#[inline]
fn from_os_str(os_str: &OsStr) -> Option<&[u8]> {
    // Unix: an OS string already is a byte sequence, so this never fails.
    #[cfg(unix)]
    #[inline]
    fn convert(os_str: &OsStr) -> Option<&[u8]> {
        Some(std::os::unix::ffi::OsStrExt::as_bytes(os_str))
    }

    // Elsewhere: only OS strings that are valid UTF-8 can be viewed as
    // bytes; anything else yields `None`.
    #[cfg(not(unix))]
    #[inline]
    fn convert(os_str: &OsStr) -> Option<&[u8]> {
        match os_str.to_str() {
            Some(text) => Some(text.as_bytes()),
            None => None,
        }
    }

    convert(os_str)
}
196
197 /// Create an immutable byte string from a file path.
198 ///
199 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
200 /// this returns `None` if the given path is not valid UTF-8. (For example,
201 /// on Windows, file paths are allowed to be a sequence of arbitrary 16-bit
202 /// integers. Not all such sequences can be transcoded to valid UTF-8.)
203 ///
204 /// # Examples
205 ///
206 /// Basic usage:
207 ///
208 /// ```
209 /// use std::path::Path;
210 ///
211 /// use bstr::{B, ByteSlice};
212 ///
213 /// let path = Path::new("foo");
214 /// let bs = <[u8]>::from_path(path).expect("should be valid UTF-8");
215 /// assert_eq!(bs, B("foo"));
216 /// ```
#[cfg(feature = "std")]
#[inline]
fn from_path(path: &Path) -> Option<&[u8]> {
    // A path is a thin wrapper around an OS string; defer to that
    // conversion.
    let os_str = path.as_os_str();
    Self::from_os_str(os_str)
}
222
223 /// Safely convert this byte string into a `&str` if it's valid UTF-8.
224 ///
225 /// If this byte string is not valid UTF-8, then an error is returned. The
226 /// error returned indicates the first invalid byte found and the length
227 /// of the error.
228 ///
229 /// In cases where a lossy conversion to `&str` is acceptable, then use one
230 /// of the [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) or
231 /// [`to_str_lossy_into`](trait.ByteSlice.html#method.to_str_lossy_into)
232 /// methods.
233 ///
234 /// # Examples
235 ///
236 /// Basic usage:
237 ///
238 /// ```
239 /// use bstr::{B, ByteSlice, ByteVec};
240 ///
241 /// # fn example() -> Result<(), bstr::Utf8Error> {
242 /// let s = B("☃βツ").to_str()?;
243 /// assert_eq!("☃βツ", s);
244 ///
245 /// let mut bstring = <Vec<u8>>::from("☃βツ");
246 /// bstring.push(b'\xFF');
247 /// let err = bstring.to_str().unwrap_err();
248 /// assert_eq!(8, err.valid_up_to());
249 /// # Ok(()) }; example().unwrap()
250 /// ```
251 #[inline]
252 fn to_str(&self) -> Result<&str, Utf8Error> {
253 utf8::validate(self.as_bytes()).map(|_| {
254 // SAFETY: This is safe because of the guarantees provided by
255 // utf8::validate.
256 unsafe { str::from_utf8_unchecked(self.as_bytes()) }
257 })
258 }
259
260 /// Unsafely convert this byte string into a `&str`, without checking for
261 /// valid UTF-8.
262 ///
263 /// # Safety
264 ///
265 /// Callers *must* ensure that this byte string is valid UTF-8 before
266 /// calling this method. Converting a byte string into a `&str` that is
267 /// not valid UTF-8 is considered undefined behavior.
268 ///
269 /// This routine is useful in performance sensitive contexts where the
270 /// UTF-8 validity of the byte string is already known and it is
271 /// undesirable to pay the cost of an additional UTF-8 validation check
272 /// that [`to_str`](trait.ByteSlice.html#method.to_str) performs.
273 ///
274 /// # Examples
275 ///
276 /// Basic usage:
277 ///
278 /// ```
279 /// use bstr::{B, ByteSlice};
280 ///
281 /// // SAFETY: This is safe because string literals are guaranteed to be
282 /// // valid UTF-8 by the Rust compiler.
283 /// let s = unsafe { B("☃βツ").to_str_unchecked() };
284 /// assert_eq!("☃βツ", s);
285 /// ```
286 #[inline]
287 unsafe fn to_str_unchecked(&self) -> &str {
288 str::from_utf8_unchecked(self.as_bytes())
289 }
290
291 /// Convert this byte string to a valid UTF-8 string by replacing invalid
292 /// UTF-8 bytes with the Unicode replacement codepoint (`U+FFFD`).
293 ///
294 /// If the byte string is already valid UTF-8, then no copying or
295 /// allocation is performed and a borrowed string slice is returned. If
296 /// the byte string is not valid UTF-8, then an owned string buffer is
297 /// returned with invalid bytes replaced by the replacement codepoint.
298 ///
299 /// This method uses the "substitution of maximal subparts" (Unicode
300 /// Standard, Chapter 3, Section 9) strategy for inserting the replacement
301 /// codepoint. Specifically, a replacement codepoint is inserted whenever a
302 /// byte is found that cannot possibly lead to a valid code unit sequence.
303 /// If there were previous bytes that represented a prefix of a well-formed
304 /// code unit sequence, then all of those bytes are substituted with a
305 /// single replacement codepoint. The "substitution of maximal subparts"
306 /// strategy is the same strategy used by
307 /// [W3C's Encoding standard](https://www.w3.org/TR/encoding/).
308 /// For a more precise description of the maximal subpart strategy, see
309 /// the Unicode Standard, Chapter 3, Section 9. See also
310 /// [Public Review Issue #121](http://www.unicode.org/review/pr-121.html).
311 ///
312 /// N.B. Rust's standard library also appears to use the same strategy,
313 /// but it does not appear to be an API guarantee.
314 ///
315 /// # Examples
316 ///
317 /// Basic usage:
318 ///
319 /// ```
320 /// use std::borrow::Cow;
321 ///
322 /// use bstr::ByteSlice;
323 ///
324 /// let mut bstring = <Vec<u8>>::from("☃βツ");
325 /// assert_eq!(Cow::Borrowed("☃βツ"), bstring.to_str_lossy());
326 ///
327 /// // Add a byte that makes the sequence invalid.
328 /// bstring.push(b'\xFF');
329 /// assert_eq!(Cow::Borrowed("☃βツ\u{FFFD}"), bstring.to_str_lossy());
330 /// ```
331 ///
332 /// This demonstrates the "maximal subpart" substitution logic.
333 ///
334 /// ```
335 /// use bstr::{B, ByteSlice};
336 ///
337 /// // \x61 is the ASCII codepoint for 'a'.
338 /// // \xF1\x80\x80 is a valid 3-byte code unit prefix.
339 /// // \xE1\x80 is a valid 2-byte code unit prefix.
340 /// // \xC2 is a valid 1-byte code unit prefix.
341 /// // \x62 is the ASCII codepoint for 'b'.
342 /// //
343 /// // In sum, each of the prefixes is replaced by a single replacement
344 /// // codepoint since none of the prefixes are properly completed. This
345 /// // is in contrast to other strategies that might insert a replacement
346 /// // codepoint for every single byte.
347 /// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62");
348 /// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy());
349 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_str_lossy(&self) -> Cow<str> {
    // Fast path: already valid UTF-8, so borrow without copying.
    let err = match utf8::validate(self.as_bytes()) {
        Ok(()) => {
            // SAFETY: utf8::validate confirmed that all of the bytes are
            // valid UTF-8.
            return Cow::Borrowed(unsafe {
                str::from_utf8_unchecked(self.as_bytes())
            });
        }
        Err(err) => err,
    };

    // Slow path: copy the valid prefix, substitute the first invalid
    // sequence, then let `to_str_lossy_into` handle whatever follows.
    let mut lossy = String::with_capacity(self.as_bytes().len());
    let (valid, after) = self.as_bytes().split_at(err.valid_up_to());
    // SAFETY: utf8::validate guarantees that everything before
    // `valid_up_to` is valid UTF-8.
    lossy.push_str(unsafe { str::from_utf8_unchecked(valid) });
    lossy.push_str("\u{FFFD}");
    // No error length means nothing more is appended after the
    // replacement codepoint.
    if let Some(len) = err.error_len() {
        after[len..].to_str_lossy_into(&mut lossy);
    }
    Cow::Owned(lossy)
}
376
377 /// Copy the contents of this byte string into the given owned string
378 /// buffer, while replacing invalid UTF-8 code unit sequences with the
379 /// Unicode replacement codepoint (`U+FFFD`).
380 ///
381 /// This method uses the same "substitution of maximal subparts" strategy
382 /// for inserting the replacement codepoint as the
383 /// [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) method.
384 ///
385 /// This routine is useful for amortizing allocation. However, unlike
386 /// `to_str_lossy`, this routine will _always_ copy the contents of this
387 /// byte string into the destination buffer, even if this byte string is
388 /// valid UTF-8.
389 ///
390 /// # Examples
391 ///
392 /// Basic usage:
393 ///
394 /// ```
395 /// use std::borrow::Cow;
396 ///
397 /// use bstr::ByteSlice;
398 ///
399 /// let mut bstring = <Vec<u8>>::from("☃βツ");
400 /// // Add a byte that makes the sequence invalid.
401 /// bstring.push(b'\xFF');
402 ///
403 /// let mut dest = String::new();
404 /// bstring.to_str_lossy_into(&mut dest);
405 /// assert_eq!("☃βツ\u{FFFD}", dest);
406 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_str_lossy_into(&self, dest: &mut String) {
    let mut rest = self.as_bytes();
    dest.reserve(rest.len());
    // Each iteration consumes one valid run plus the invalid sequence
    // that follows it; the loop ends once the remainder validates.
    while let Err(err) = utf8::validate(rest) {
        let (valid, after) = rest.split_at(err.valid_up_to());
        // SAFETY: utf8::validate guarantees that everything before
        // `valid_up_to` is valid UTF-8.
        dest.push_str(unsafe { str::from_utf8_unchecked(valid) });
        dest.push_str("\u{FFFD}");
        match err.error_len() {
            Some(len) => rest = &after[len..],
            // No error length: nothing further is appended.
            None => return,
        }
    }
    // SAFETY: the loop only exits normally when utf8::validate accepted
    // all of `rest`.
    dest.push_str(unsafe { str::from_utf8_unchecked(rest) });
}
434
435 /// Create an OS string slice from this byte string.
436 ///
437 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
438 /// this returns a UTF-8 decoding error if this byte string is not valid
439 /// UTF-8. (For example, on Windows, file paths are allowed to be a
440 /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
441 /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
442 /// 16-bit integers.)
443 ///
444 /// # Examples
445 ///
446 /// Basic usage:
447 ///
448 /// ```
449 /// use bstr::{B, ByteSlice};
450 ///
451 /// let os_str = b"foo".to_os_str().expect("should be valid UTF-8");
452 /// assert_eq!(os_str, "foo");
453 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_os_str(&self) -> Result<&OsStr, Utf8Error> {
    // Unix: any byte sequence is an acceptable OS string.
    #[cfg(unix)]
    #[inline]
    fn convert(bytes: &[u8]) -> Result<&OsStr, Utf8Error> {
        Ok(<OsStr as std::os::unix::ffi::OsStrExt>::from_bytes(bytes))
    }

    // Elsewhere: the bytes must validate as UTF-8 first; the validation
    // error is passed through unchanged.
    #[cfg(not(unix))]
    #[inline]
    fn convert(bytes: &[u8]) -> Result<&OsStr, Utf8Error> {
        match bytes.to_str() {
            Ok(text) => Ok(OsStr::new(text)),
            Err(err) => Err(err),
        }
    }

    convert(self.as_bytes())
}
473
474 /// Lossily create an OS string slice from this byte string.
475 ///
476 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
477 /// this will perform a UTF-8 check and lossily convert this byte string
478 /// into valid UTF-8 using the Unicode replacement codepoint.
479 ///
480 /// Note that this can prevent the correct roundtripping of file paths on
481 /// non-Unix systems such as Windows, where file paths are an arbitrary
482 /// sequence of 16-bit integers.
483 ///
484 /// # Examples
485 ///
486 /// Basic usage:
487 ///
488 /// ```
489 /// use bstr::ByteSlice;
490 ///
491 /// let os_str = b"foo\xFFbar".to_os_str_lossy();
492 /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar");
493 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_os_str_lossy(&self) -> Cow<OsStr> {
    // Unix: borrow the bytes directly as an OS string.
    #[cfg(unix)]
    #[inline]
    fn convert(bytes: &[u8]) -> Cow<OsStr> {
        Cow::Borrowed(<OsStr as std::os::unix::ffi::OsStrExt>::from_bytes(
            bytes,
        ))
    }

    // Elsewhere: go through the lossy UTF-8 conversion and re-wrap the
    // result, keeping the borrowed/owned distinction intact.
    #[cfg(not(unix))]
    #[inline]
    fn convert(bytes: &[u8]) -> Cow<OsStr> {
        let text = bytes.to_str_lossy();
        match text {
            Cow::Owned(owned) => Cow::Owned(std::ffi::OsString::from(owned)),
            Cow::Borrowed(borrowed) => Cow::Borrowed(OsStr::new(borrowed)),
        }
    }

    convert(self.as_bytes())
}
518
519 /// Create a path slice from this byte string.
520 ///
521 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
522 /// this returns a UTF-8 decoding error if this byte string is not valid
523 /// UTF-8. (For example, on Windows, file paths are allowed to be a
524 /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
525 /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
526 /// 16-bit integers.)
527 ///
528 /// # Examples
529 ///
530 /// Basic usage:
531 ///
532 /// ```
533 /// use bstr::ByteSlice;
534 ///
535 /// let path = b"foo".to_path().expect("should be valid UTF-8");
536 /// assert_eq!(path.as_os_str(), "foo");
537 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_path(&self) -> Result<&Path, Utf8Error> {
    // Succeeds or fails exactly as `to_os_str` does; on success the OS
    // string is viewed as a path.
    match self.to_os_str() {
        Ok(os_str) => Ok(Path::new(os_str)),
        Err(err) => Err(err),
    }
}
543
544 /// Lossily create a path slice from this byte string.
545 ///
546 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
547 /// this will perform a UTF-8 check and lossily convert this byte string
548 /// into valid UTF-8 using the Unicode replacement codepoint.
549 ///
550 /// Note that this can prevent the correct roundtripping of file paths on
551 /// non-Unix systems such as Windows, where file paths are an arbitrary
552 /// sequence of 16-bit integers.
553 ///
554 /// # Examples
555 ///
556 /// Basic usage:
557 ///
558 /// ```
559 /// use bstr::ByteSlice;
560 ///
561 /// let bs = b"foo\xFFbar";
562 /// let path = bs.to_path_lossy();
563 /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar");
564 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_path_lossy(&self) -> Cow<Path> {
    // Re-wrap the lossy OS string as a path, keeping the borrowed/owned
    // distinction intact.
    let os_str = self.to_os_str_lossy();
    match os_str {
        Cow::Owned(owned) => Cow::Owned(std::path::PathBuf::from(owned)),
        Cow::Borrowed(borrowed) => Cow::Borrowed(Path::new(borrowed)),
    }
}
575
576 /// Create a new byte string by repeating this byte string `n` times.
577 ///
578 /// # Panics
579 ///
580 /// This function panics if the capacity of the new byte string would
581 /// overflow.
582 ///
583 /// # Examples
584 ///
585 /// Basic usage:
586 ///
587 /// ```
588 /// use bstr::{B, ByteSlice};
589 ///
590 /// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo"));
591 /// assert_eq!(b"foo".repeatn(0), B(""));
592 /// ```
#[cfg(feature = "std")]
#[inline]
fn repeatn(&self, n: usize) -> Vec<u8> {
    let bs = self.as_bytes();
    // Compute the final length with an explicit overflow check. A plain
    // `bs.len() * n` wraps silently in release builds, which would
    // allocate an undersized buffer and only fail later with an unrelated
    // slice-index panic. This makes the documented panic deterministic.
    let total = bs.len().checked_mul(n).expect("capacity overflow");
    if total == 0 {
        // Either the input is empty or `n == 0`; there is nothing to copy
        // and no reason to loop `n` times.
        return Vec::new();
    }
    // Allocate once, then append each copy. This avoids zero-filling a
    // buffer that is overwritten immediately afterwards.
    let mut dst = Vec::with_capacity(total);
    for _ in 0..n {
        dst.extend_from_slice(bs);
    }
    dst
}
603
604 /// Returns true if and only if this byte string contains the given needle.
605 ///
606 /// # Examples
607 ///
608 /// Basic usage:
609 ///
610 /// ```
611 /// use bstr::ByteSlice;
612 ///
613 /// assert!(b"foo bar".contains_str("foo"));
614 /// assert!(b"foo bar".contains_str("bar"));
615 /// assert!(!b"foo".contains_str("foobar"));
616 /// ```
617 #[inline]
618 fn contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool {
619 self.find(needle).is_some()
620 }
621
622 /// Returns true if and only if this byte string has the given prefix.
623 ///
624 /// # Examples
625 ///
626 /// Basic usage:
627 ///
628 /// ```
629 /// use bstr::ByteSlice;
630 ///
631 /// assert!(b"foo bar".starts_with_str("foo"));
632 /// assert!(!b"foo bar".starts_with_str("bar"));
633 /// assert!(!b"foo".starts_with_str("foobar"));
634 /// ```
635 #[inline]
636 fn starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool {
637 self.as_bytes().starts_with(prefix.as_ref())
638 }
639
640 /// Returns true if and only if this byte string has the given suffix.
641 ///
642 /// # Examples
643 ///
644 /// Basic usage:
645 ///
646 /// ```
647 /// use bstr::ByteSlice;
648 ///
649 /// assert!(b"foo bar".ends_with_str("bar"));
650 /// assert!(!b"foo bar".ends_with_str("foo"));
651 /// assert!(!b"bar".ends_with_str("foobar"));
652 /// ```
653 #[inline]
654 fn ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool {
655 self.as_bytes().ends_with(suffix.as_ref())
656 }
657
658 /// Returns the index of the first occurrence of the given needle.
659 ///
660 /// The needle may be any type that can be cheaply converted into a
661 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
662 ///
663 /// Note that if you are searching for the same needle in many
664 /// different small haystacks, it may be faster to initialize a
665 /// [`Finder`](struct.Finder.html) once, and reuse it for each search.
666 ///
667 /// # Complexity
668 ///
669 /// This routine is guaranteed to have worst case linear time complexity
670 /// with respect to both the needle and the haystack. That is, this runs
671 /// in `O(needle.len() + haystack.len())` time.
672 ///
673 /// This routine is also guaranteed to have worst case constant space
674 /// complexity.
675 ///
676 /// # Examples
677 ///
678 /// Basic usage:
679 ///
680 /// ```
681 /// use bstr::ByteSlice;
682 ///
683 /// let s = b"foo bar baz";
684 /// assert_eq!(Some(0), s.find("foo"));
685 /// assert_eq!(Some(4), s.find("bar"));
686 /// assert_eq!(None, s.find("quux"));
687 /// ```
688 #[inline]
689 fn find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
690 Finder::new(needle.as_ref()).find(self.as_bytes())
691 }
692
693 /// Returns the index of the last occurrence of the given needle.
694 ///
695 /// The needle may be any type that can be cheaply converted into a
696 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
697 ///
698 /// Note that if you are searching for the same needle in many
699 /// different small haystacks, it may be faster to initialize a
700 /// [`FinderReverse`](struct.FinderReverse.html) once, and reuse it for
701 /// each search.
702 ///
703 /// # Complexity
704 ///
705 /// This routine is guaranteed to have worst case linear time complexity
706 /// with respect to both the needle and the haystack. That is, this runs
707 /// in `O(needle.len() + haystack.len())` time.
708 ///
709 /// This routine is also guaranteed to have worst case constant space
710 /// complexity.
711 ///
712 /// # Examples
713 ///
714 /// Basic usage:
715 ///
716 /// ```
717 /// use bstr::ByteSlice;
718 ///
719 /// let s = b"foo bar baz";
720 /// assert_eq!(Some(0), s.rfind("foo"));
721 /// assert_eq!(Some(4), s.rfind("bar"));
722 /// assert_eq!(Some(8), s.rfind("ba"));
723 /// assert_eq!(None, s.rfind("quux"));
724 /// ```
725 #[inline]
726 fn rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
727 FinderReverse::new(needle.as_ref()).rfind(self.as_bytes())
728 }
729
730 /// Returns an iterator of the non-overlapping occurrences of the given
731 /// needle. The iterator yields byte offset positions indicating the start
732 /// of each match.
733 ///
734 /// # Complexity
735 ///
736 /// This routine is guaranteed to have worst case linear time complexity
737 /// with respect to both the needle and the haystack. That is, this runs
738 /// in `O(needle.len() + haystack.len())` time.
739 ///
740 /// This routine is also guaranteed to have worst case constant space
741 /// complexity.
742 ///
743 /// # Examples
744 ///
745 /// Basic usage:
746 ///
747 /// ```
748 /// use bstr::ByteSlice;
749 ///
750 /// let s = b"foo bar foo foo quux foo";
751 /// let matches: Vec<usize> = s.find_iter("foo").collect();
752 /// assert_eq!(matches, vec![0, 8, 12, 21]);
753 /// ```
754 ///
755 /// An empty string matches at every position, including the position
756 /// immediately following the last byte:
757 ///
758 /// ```
759 /// use bstr::ByteSlice;
760 ///
761 /// let matches: Vec<usize> = b"foo".find_iter("").collect();
762 /// assert_eq!(matches, vec![0, 1, 2, 3]);
763 ///
764 /// let matches: Vec<usize> = b"".find_iter("").collect();
765 /// assert_eq!(matches, vec![0]);
766 /// ```
767 #[inline]
768 fn find_iter<'a, B: ?Sized + AsRef<[u8]>>(
769 &'a self,
770 needle: &'a B,
771 ) -> Find<'a> {
772 Find::new(self.as_bytes(), needle.as_ref())
773 }
774
775 /// Returns an iterator of the non-overlapping occurrences of the given
776 /// needle in reverse. The iterator yields byte offset positions indicating
777 /// the start of each match.
778 ///
779 /// # Complexity
780 ///
781 /// This routine is guaranteed to have worst case linear time complexity
782 /// with respect to both the needle and the haystack. That is, this runs
783 /// in `O(needle.len() + haystack.len())` time.
784 ///
785 /// This routine is also guaranteed to have worst case constant space
786 /// complexity.
787 ///
788 /// # Examples
789 ///
790 /// Basic usage:
791 ///
792 /// ```
793 /// use bstr::ByteSlice;
794 ///
795 /// let s = b"foo bar foo foo quux foo";
796 /// let matches: Vec<usize> = s.rfind_iter("foo").collect();
797 /// assert_eq!(matches, vec![21, 12, 8, 0]);
798 /// ```
799 ///
800 /// An empty string matches at every position, including the position
801 /// immediately following the last byte:
802 ///
803 /// ```
804 /// use bstr::ByteSlice;
805 ///
806 /// let matches: Vec<usize> = b"foo".rfind_iter("").collect();
807 /// assert_eq!(matches, vec![3, 2, 1, 0]);
808 ///
809 /// let matches: Vec<usize> = b"".rfind_iter("").collect();
810 /// assert_eq!(matches, vec![0]);
811 /// ```
812 #[inline]
813 fn rfind_iter<'a, B: ?Sized + AsRef<[u8]>>(
814 &'a self,
815 needle: &'a B,
816 ) -> FindReverse<'a> {
817 FindReverse::new(self.as_bytes(), needle.as_ref())
818 }
819
820 /// Returns the index of the first occurrence of the given byte. If the
821 /// byte does not occur in this byte string, then `None` is returned.
822 ///
823 /// # Examples
824 ///
825 /// Basic usage:
826 ///
827 /// ```
828 /// use bstr::ByteSlice;
829 ///
830 /// assert_eq!(Some(10), b"foo bar baz".find_byte(b'z'));
831 /// assert_eq!(None, b"foo bar baz".find_byte(b'y'));
832 /// ```
833 #[inline]
834 fn find_byte(&self, byte: u8) -> Option<usize> {
835 memchr(byte, self.as_bytes())
836 }
837
838 /// Returns the index of the last occurrence of the given byte. If the
839 /// byte does not occur in this byte string, then `None` is returned.
840 ///
841 /// # Examples
842 ///
843 /// Basic usage:
844 ///
845 /// ```
846 /// use bstr::ByteSlice;
847 ///
848 /// assert_eq!(Some(10), b"foo bar baz".rfind_byte(b'z'));
849 /// assert_eq!(None, b"foo bar baz".rfind_byte(b'y'));
850 /// ```
851 #[inline]
852 fn rfind_byte(&self, byte: u8) -> Option<usize> {
853 memrchr(byte, self.as_bytes())
854 }
855
856 /// Returns the index of the first occurrence of the given codepoint.
857 /// If the codepoint does not occur in this byte string, then `None` is
858 /// returned.
859 ///
860 /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
861 /// then only explicit occurrences of that encoding will be found. Invalid
862 /// UTF-8 sequences will not be matched.
863 ///
864 /// # Examples
865 ///
866 /// Basic usage:
867 ///
868 /// ```
869 /// use bstr::{B, ByteSlice};
870 ///
871 /// assert_eq!(Some(10), b"foo bar baz".find_char('z'));
872 /// assert_eq!(Some(4), B("αβγγδ").find_char('γ'));
873 /// assert_eq!(None, b"foo bar baz".find_char('y'));
874 /// ```
875 #[inline]
876 fn find_char(&self, ch: char) -> Option<usize> {
877 self.find(ch.encode_utf8(&mut [0; 4]))
878 }
879
880 /// Returns the index of the last occurrence of the given codepoint.
881 /// If the codepoint does not occur in this byte string, then `None` is
882 /// returned.
883 ///
884 /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
885 /// then only explicit occurrences of that encoding will be found. Invalid
886 /// UTF-8 sequences will not be matched.
887 ///
888 /// # Examples
889 ///
890 /// Basic usage:
891 ///
892 /// ```
893 /// use bstr::{B, ByteSlice};
894 ///
895 /// assert_eq!(Some(10), b"foo bar baz".rfind_char('z'));
896 /// assert_eq!(Some(6), B("αβγγδ").rfind_char('γ'));
897 /// assert_eq!(None, b"foo bar baz".rfind_char('y'));
898 /// ```
899 #[inline]
900 fn rfind_char(&self, ch: char) -> Option<usize> {
901 self.rfind(ch.encode_utf8(&mut [0; 4]))
902 }
903
904 /// Returns the index of the first occurrence of any of the bytes in the
905 /// provided set.
906 ///
907 /// The `byteset` may be any type that can be cheaply converted into a
908 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
909 /// note that passing a `&str` which contains multibyte characters may not
910 /// behave as you expect: each byte in the `&str` is treated as an
911 /// individual member of the byte set.
912 ///
913 /// Note that order is irrelevant for the `byteset` parameter, and
914 /// duplicate bytes present in its body are ignored.
915 ///
916 /// # Complexity
917 ///
918 /// This routine is guaranteed to have worst case linear time complexity
919 /// with respect to both the set of bytes and the haystack. That is, this
920 /// runs in `O(byteset.len() + haystack.len())` time.
921 ///
922 /// This routine is also guaranteed to have worst case constant space
923 /// complexity.
924 ///
925 /// # Examples
926 ///
927 /// Basic usage:
928 ///
929 /// ```
930 /// use bstr::ByteSlice;
931 ///
932 /// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6));
933 /// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4));
934 /// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n"));
935 /// ```
936 #[inline]
937 fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
938 byteset::find(self.as_bytes(), byteset.as_ref())
939 }
940
941 /// Returns the index of the first occurrence of a byte that is not a member
942 /// of the provided set.
943 ///
944 /// The `byteset` may be any type that can be cheaply converted into a
945 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
946 /// note that passing a `&str` which contains multibyte characters may not
947 /// behave as you expect: each byte in the `&str` is treated as an
948 /// individual member of the byte set.
949 ///
950 /// Note that order is irrelevant for the `byteset` parameter, and
951 /// duplicate bytes present in its body are ignored.
952 ///
953 /// # Complexity
954 ///
955 /// This routine is guaranteed to have worst case linear time complexity
956 /// with respect to both the set of bytes and the haystack. That is, this
957 /// runs in `O(byteset.len() + haystack.len())` time.
958 ///
959 /// This routine is also guaranteed to have worst case constant space
960 /// complexity.
961 ///
962 /// # Examples
963 ///
964 /// Basic usage:
965 ///
966 /// ```
967 /// use bstr::ByteSlice;
968 ///
969 /// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4));
970 /// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2));
971 /// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0));
972 /// ```
973 #[inline]
974 fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
975 byteset::find_not(self.as_bytes(), byteset.as_ref())
976 }
977
978 /// Returns the index of the last occurrence of any of the bytes in the
979 /// provided set.
980 ///
981 /// The `byteset` may be any type that can be cheaply converted into a
982 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
983 /// note that passing a `&str` which contains multibyte characters may not
984 /// behave as you expect: each byte in the `&str` is treated as an
985 /// individual member of the byte set.
986 ///
987 /// Note that order is irrelevant for the `byteset` parameter, and duplicate
988 /// bytes present in its body are ignored.
989 ///
990 /// # Complexity
991 ///
992 /// This routine is guaranteed to have worst case linear time complexity
993 /// with respect to both the set of bytes and the haystack. That is, this
994 /// runs in `O(byteset.len() + haystack.len())` time.
995 ///
996 /// This routine is also guaranteed to have worst case constant space
997 /// complexity.
998 ///
999 /// # Examples
1000 ///
1001 /// Basic usage:
1002 ///
1003 /// ```
1004 /// use bstr::ByteSlice;
1005 ///
1006 /// assert_eq!(b"foo bar baz".rfind_byteset(b"agb"), Some(9));
1007 /// assert_eq!(b"foo baz bar".rfind_byteset(b"rabz "), Some(10));
1008 /// assert_eq!(b"foo baz bar".rfind_byteset(b"\n123"), None);
1009 /// ```
1010 #[inline]
1011 fn rfind_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
1012 byteset::rfind(self.as_bytes(), byteset.as_ref())
1013 }
1014
1015 /// Returns the index of the last occurrence of a byte that is not a member
1016 /// of the provided set.
1017 ///
1018 /// The `byteset` may be any type that can be cheaply converted into a
1019 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
1020 /// note that passing a `&str` which contains multibyte characters may not
1021 /// behave as you expect: each byte in the `&str` is treated as an
1022 /// individual member of the byte set.
1023 ///
1024 /// Note that order is irrelevant for the `byteset` parameter, and
1025 /// duplicate bytes present in its body are ignored.
1026 ///
1027 /// # Complexity
1028 ///
1029 /// This routine is guaranteed to have worst case linear time complexity
1030 /// with respect to both the set of bytes and the haystack. That is, this
1031 /// runs in `O(byteset.len() + haystack.len())` time.
1032 ///
1033 /// This routine is also guaranteed to have worst case constant space
1034 /// complexity.
1035 ///
1036 /// # Examples
1037 ///
1038 /// Basic usage:
1039 ///
1040 /// ```
1041 /// use bstr::ByteSlice;
1042 ///
1043 /// assert_eq!(b"foo bar baz,\t".rfind_not_byteset(b",\t"), Some(10));
1044 /// assert_eq!(b"foo baz bar".rfind_not_byteset(b"rabz "), Some(2));
1045 /// assert_eq!(None, b"foo baz bar".rfind_not_byteset(b"barfoz "));
1046 /// ```
1047 #[inline]
1048 fn rfind_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
1049 byteset::rfind_not(self.as_bytes(), byteset.as_ref())
1050 }
1051
1052 /// Returns an iterator over the fields in a byte string, separated by
1053 /// contiguous whitespace.
1054 ///
1055 /// # Example
1056 ///
1057 /// Basic usage:
1058 ///
1059 /// ```
1060 /// use bstr::{B, ByteSlice};
1061 ///
1062 /// let s = B(" foo\tbar\t\u{2003}\nquux \n");
1063 /// let fields: Vec<&[u8]> = s.fields().collect();
1064 /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
1065 /// ```
1066 ///
1067 /// A byte string consisting of just whitespace yields no elements:
1068 ///
1069 /// ```
1070 /// use bstr::{B, ByteSlice};
1071 ///
1072 /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count());
1073 /// ```
1074 #[inline]
1075 fn fields(&self) -> Fields {
1076 Fields::new(self.as_bytes())
1077 }
1078
1079 /// Returns an iterator over the fields in a byte string, separated by
1080 /// contiguous codepoints satisfying the given predicate.
1081 ///
1082 /// If this byte string is not valid UTF-8, then the given closure will
1083 /// be called with a Unicode replacement codepoint when invalid UTF-8
1084 /// bytes are seen.
1085 ///
1086 /// # Example
1087 ///
1088 /// Basic usage:
1089 ///
1090 /// ```
1091 /// use bstr::{B, ByteSlice};
1092 ///
1093 /// let s = b"123foo999999bar1quux123456";
1094 /// let fields: Vec<&[u8]> = s.fields_with(|c| c.is_numeric()).collect();
1095 /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
1096 /// ```
1097 ///
1098 /// A byte string consisting of all codepoints satisfying the predicate
1099 /// yields no elements:
1100 ///
1101 /// ```
1102 /// use bstr::ByteSlice;
1103 ///
1104 /// assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count());
1105 /// ```
1106 #[inline]
1107 fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<F> {
1108 FieldsWith::new(self.as_bytes(), f)
1109 }
1110
1111 /// Returns an iterator over substrings of this byte string, separated
1112 /// by the given byte string. Each element yielded is guaranteed not to
1113 /// include the splitter substring.
1114 ///
1115 /// The splitter may be any type that can be cheaply converted into a
1116 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1117 ///
1118 /// # Examples
1119 ///
1120 /// Basic usage:
1121 ///
1122 /// ```
1123 /// use bstr::{B, ByteSlice};
1124 ///
1125 /// let x: Vec<&[u8]> = b"Mary had a little lamb".split_str(" ").collect();
1126 /// assert_eq!(x, vec![
1127 /// B("Mary"), B("had"), B("a"), B("little"), B("lamb"),
1128 /// ]);
1129 ///
1130 /// let x: Vec<&[u8]> = b"".split_str("X").collect();
1131 /// assert_eq!(x, vec![b""]);
1132 ///
1133 /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".split_str("X").collect();
1134 /// assert_eq!(x, vec![B("lion"), B(""), B("tiger"), B("leopard")]);
1135 ///
1136 /// let x: Vec<&[u8]> = b"lion::tiger::leopard".split_str("::").collect();
1137 /// assert_eq!(x, vec![B("lion"), B("tiger"), B("leopard")]);
1138 /// ```
1139 ///
1140 /// If a string contains multiple contiguous separators, you will end up
1141 /// with empty strings yielded by the iterator:
1142 ///
1143 /// ```
1144 /// use bstr::{B, ByteSlice};
1145 ///
1146 /// let x: Vec<&[u8]> = b"||||a||b|c".split_str("|").collect();
1147 /// assert_eq!(x, vec![
1148 /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
1149 /// ]);
1150 ///
1151 /// let x: Vec<&[u8]> = b"(///)".split_str("/").collect();
1152 /// assert_eq!(x, vec![B("("), B(""), B(""), B(")")]);
1153 /// ```
1154 ///
1155 /// Separators at the start or end of a string are neighbored by empty
1156 /// strings.
1157 ///
1158 /// ```
1159 /// use bstr::{B, ByteSlice};
1160 ///
1161 /// let x: Vec<&[u8]> = b"010".split_str("0").collect();
1162 /// assert_eq!(x, vec![B(""), B("1"), B("")]);
1163 /// ```
1164 ///
1165 /// When the empty string is used as a separator, it splits every **byte**
1166 /// in the byte string, along with the beginning and end of the byte
1167 /// string.
1168 ///
1169 /// ```
1170 /// use bstr::{B, ByteSlice};
1171 ///
1172 /// let x: Vec<&[u8]> = b"rust".split_str("").collect();
1173 /// assert_eq!(x, vec![
1174 /// B(""), B("r"), B("u"), B("s"), B("t"), B(""),
1175 /// ]);
1176 ///
1177 /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
1178 /// // may not be valid UTF-8!
1179 /// let x: Vec<&[u8]> = B("☃").split_str("").collect();
1180 /// assert_eq!(x, vec![
1181 /// B(""), B(b"\xE2"), B(b"\x98"), B(b"\x83"), B(""),
1182 /// ]);
1183 /// ```
1184 ///
1185 /// Contiguous separators, especially whitespace, can lead to possibly
1186 /// surprising behavior. For example, this code is correct:
1187 ///
1188 /// ```
1189 /// use bstr::{B, ByteSlice};
1190 ///
1191 /// let x: Vec<&[u8]> = b" a b c".split_str(" ").collect();
1192 /// assert_eq!(x, vec![
1193 /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
1194 /// ]);
1195 /// ```
1196 ///
1197 /// It does *not* give you `["a", "b", "c"]`. For that behavior, use
1198 /// [`fields`](#method.fields) instead.
1199 #[inline]
1200 fn split_str<'a, B: ?Sized + AsRef<[u8]>>(
1201 &'a self,
1202 splitter: &'a B,
1203 ) -> Split<'a> {
1204 Split::new(self.as_bytes(), splitter.as_ref())
1205 }
1206
1207 /// Returns an iterator over substrings of this byte string, separated by
1208 /// the given byte string, in reverse. Each element yielded is guaranteed
1209 /// not to include the splitter substring.
1210 ///
1211 /// The splitter may be any type that can be cheaply converted into a
1212 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1213 ///
1214 /// # Examples
1215 ///
1216 /// Basic usage:
1217 ///
1218 /// ```
1219 /// use bstr::{B, ByteSlice};
1220 ///
1221 /// let x: Vec<&[u8]> =
1222 /// b"Mary had a little lamb".rsplit_str(" ").collect();
1223 /// assert_eq!(x, vec![
1224 /// B("lamb"), B("little"), B("a"), B("had"), B("Mary"),
1225 /// ]);
1226 ///
1227 /// let x: Vec<&[u8]> = b"".rsplit_str("X").collect();
1228 /// assert_eq!(x, vec![b""]);
1229 ///
1230 /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".rsplit_str("X").collect();
1231 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B(""), B("lion")]);
1232 ///
1233 /// let x: Vec<&[u8]> = b"lion::tiger::leopard".rsplit_str("::").collect();
1234 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lion")]);
1235 /// ```
1236 ///
1237 /// If a string contains multiple contiguous separators, you will end up
1238 /// with empty strings yielded by the iterator:
1239 ///
1240 /// ```
1241 /// use bstr::{B, ByteSlice};
1242 ///
1243 /// let x: Vec<&[u8]> = b"||||a||b|c".rsplit_str("|").collect();
1244 /// assert_eq!(x, vec![
1245 /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
1246 /// ]);
1247 ///
1248 /// let x: Vec<&[u8]> = b"(///)".rsplit_str("/").collect();
1249 /// assert_eq!(x, vec![B(")"), B(""), B(""), B("(")]);
1250 /// ```
1251 ///
1252 /// Separators at the start or end of a string are neighbored by empty
1253 /// strings.
1254 ///
1255 /// ```
1256 /// use bstr::{B, ByteSlice};
1257 ///
1258 /// let x: Vec<&[u8]> = b"010".rsplit_str("0").collect();
1259 /// assert_eq!(x, vec![B(""), B("1"), B("")]);
1260 /// ```
1261 ///
1262 /// When the empty string is used as a separator, it splits every **byte**
1263 /// in the byte string, along with the beginning and end of the byte
1264 /// string.
1265 ///
1266 /// ```
1267 /// use bstr::{B, ByteSlice};
1268 ///
1269 /// let x: Vec<&[u8]> = b"rust".rsplit_str("").collect();
1270 /// assert_eq!(x, vec![
1271 /// B(""), B("t"), B("s"), B("u"), B("r"), B(""),
1272 /// ]);
1273 ///
1274 /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
1275 /// // may not be valid UTF-8!
1276 /// let x: Vec<&[u8]> = B("☃").rsplit_str("").collect();
1277 /// assert_eq!(x, vec![B(""), B(b"\x83"), B(b"\x98"), B(b"\xE2"), B("")]);
1278 /// ```
1279 ///
1280 /// Contiguous separators, especially whitespace, can lead to possibly
1281 /// surprising behavior. For example, this code is correct:
1282 ///
1283 /// ```
1284 /// use bstr::{B, ByteSlice};
1285 ///
1286 /// let x: Vec<&[u8]> = b" a b c".rsplit_str(" ").collect();
1287 /// assert_eq!(x, vec![
1288 /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
1289 /// ]);
1290 /// ```
1291 ///
1292 /// It does *not* give you `["a", "b", "c"]`.
1293 #[inline]
1294 fn rsplit_str<'a, B: ?Sized + AsRef<[u8]>>(
1295 &'a self,
1296 splitter: &'a B,
1297 ) -> SplitReverse<'a> {
1298 SplitReverse::new(self.as_bytes(), splitter.as_ref())
1299 }
1300
1301 /// Returns an iterator of at most `limit` substrings of this byte string,
1302 /// separated by the given byte string. If `limit` substrings are yielded,
1303 /// then the last substring will contain the remainder of this byte string.
1304 ///
1305 /// The splitter may be any type that can be cheaply converted into a
1306 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1307 ///
1308 /// # Examples
1309 ///
1310 /// Basic usage:
1311 ///
1312 /// ```
1313 /// use bstr::{B, ByteSlice};
1314 ///
1315 /// let x: Vec<_> = b"Mary had a little lamb".splitn_str(3, " ").collect();
1316 /// assert_eq!(x, vec![B("Mary"), B("had"), B("a little lamb")]);
1317 ///
1318 /// let x: Vec<_> = b"".splitn_str(3, "X").collect();
1319 /// assert_eq!(x, vec![b""]);
1320 ///
1321 /// let x: Vec<_> = b"lionXXtigerXleopard".splitn_str(3, "X").collect();
1322 /// assert_eq!(x, vec![B("lion"), B(""), B("tigerXleopard")]);
1323 ///
1324 /// let x: Vec<_> = b"lion::tiger::leopard".splitn_str(2, "::").collect();
1325 /// assert_eq!(x, vec![B("lion"), B("tiger::leopard")]);
1326 ///
1327 /// let x: Vec<_> = b"abcXdef".splitn_str(1, "X").collect();
1328 /// assert_eq!(x, vec![B("abcXdef")]);
1329 ///
1330 /// let x: Vec<_> = b"abcdef".splitn_str(2, "X").collect();
1331 /// assert_eq!(x, vec![B("abcdef")]);
1332 ///
1333 /// let x: Vec<_> = b"abcXdef".splitn_str(0, "X").collect();
1334 /// assert!(x.is_empty());
1335 /// ```
1336 #[inline]
1337 fn splitn_str<'a, B: ?Sized + AsRef<[u8]>>(
1338 &'a self,
1339 limit: usize,
1340 splitter: &'a B,
1341 ) -> SplitN<'a> {
1342 SplitN::new(self.as_bytes(), splitter.as_ref(), limit)
1343 }
1344
1345 /// Returns an iterator of at most `limit` substrings of this byte string,
1346 /// separated by the given byte string, in reverse. If `limit` substrings
1347 /// are yielded, then the last substring will contain the remainder of this
1348 /// byte string.
1349 ///
1350 /// The splitter may be any type that can be cheaply converted into a
1351 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1352 ///
1353 /// # Examples
1354 ///
1355 /// Basic usage:
1356 ///
1357 /// ```
1358 /// use bstr::{B, ByteSlice};
1359 ///
1360 /// let x: Vec<_> =
1361 /// b"Mary had a little lamb".rsplitn_str(3, " ").collect();
1362 /// assert_eq!(x, vec![B("lamb"), B("little"), B("Mary had a")]);
1363 ///
1364 /// let x: Vec<_> = b"".rsplitn_str(3, "X").collect();
1365 /// assert_eq!(x, vec![b""]);
1366 ///
1367 /// let x: Vec<_> = b"lionXXtigerXleopard".rsplitn_str(3, "X").collect();
1368 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lionX")]);
1369 ///
1370 /// let x: Vec<_> = b"lion::tiger::leopard".rsplitn_str(2, "::").collect();
1371 /// assert_eq!(x, vec![B("leopard"), B("lion::tiger")]);
1372 ///
1373 /// let x: Vec<_> = b"abcXdef".rsplitn_str(1, "X").collect();
1374 /// assert_eq!(x, vec![B("abcXdef")]);
1375 ///
1376 /// let x: Vec<_> = b"abcdef".rsplitn_str(2, "X").collect();
1377 /// assert_eq!(x, vec![B("abcdef")]);
1378 ///
1379 /// let x: Vec<_> = b"abcXdef".rsplitn_str(0, "X").collect();
1380 /// assert!(x.is_empty());
1381 /// ```
1382 #[inline]
1383 fn rsplitn_str<'a, B: ?Sized + AsRef<[u8]>>(
1384 &'a self,
1385 limit: usize,
1386 splitter: &'a B,
1387 ) -> SplitNReverse<'a> {
1388 SplitNReverse::new(self.as_bytes(), splitter.as_ref(), limit)
1389 }
1390
1391 /// Replace all matches of the given needle with the given replacement, and
1392 /// return the result as a new `Vec<u8>`.
1393 ///
1394 /// This routine is useful as a convenience. If you need to reuse an
1395 /// allocation, use [`replace_into`](#method.replace_into) instead.
1396 ///
1397 /// # Examples
1398 ///
1399 /// Basic usage:
1400 ///
1401 /// ```
1402 /// use bstr::ByteSlice;
1403 ///
1404 /// let s = b"this is old".replace("old", "new");
1405 /// assert_eq!(s, "this is new".as_bytes());
1406 /// ```
1407 ///
1408 /// When the pattern doesn't match:
1409 ///
1410 /// ```
1411 /// use bstr::ByteSlice;
1412 ///
1413 /// let s = b"this is old".replace("nada nada", "limonada");
1414 /// assert_eq!(s, "this is old".as_bytes());
1415 /// ```
1416 ///
1417 /// When the needle is an empty string:
1418 ///
1419 /// ```
1420 /// use bstr::ByteSlice;
1421 ///
1422 /// let s = b"foo".replace("", "Z");
1423 /// assert_eq!(s, "ZfZoZoZ".as_bytes());
1424 /// ```
1425 #[cfg(feature = "std")]
1426 #[inline]
1427 fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>(
1428 &self,
1429 needle: N,
1430 replacement: R,
1431 ) -> Vec<u8> {
1432 let mut dest = Vec::with_capacity(self.as_bytes().len());
1433 self.replace_into(needle, replacement, &mut dest);
1434 dest
1435 }
1436
1437 /// Replace up to `limit` matches of the given needle with the given
1438 /// replacement, and return the result as a new `Vec<u8>`.
1439 ///
1440 /// This routine is useful as a convenience. If you need to reuse an
1441 /// allocation, use [`replacen_into`](#method.replacen_into) instead.
1442 ///
1443 /// # Examples
1444 ///
1445 /// Basic usage:
1446 ///
1447 /// ```
1448 /// use bstr::ByteSlice;
1449 ///
1450 /// let s = b"foofoo".replacen("o", "z", 2);
1451 /// assert_eq!(s, "fzzfoo".as_bytes());
1452 /// ```
1453 ///
1454 /// When the pattern doesn't match:
1455 ///
1456 /// ```
1457 /// use bstr::ByteSlice;
1458 ///
1459 /// let s = b"foofoo".replacen("a", "z", 2);
1460 /// assert_eq!(s, "foofoo".as_bytes());
1461 /// ```
1462 ///
1463 /// When the needle is an empty string:
1464 ///
1465 /// ```
1466 /// use bstr::ByteSlice;
1467 ///
1468 /// let s = b"foo".replacen("", "Z", 2);
1469 /// assert_eq!(s, "ZfZoo".as_bytes());
1470 /// ```
1471 #[cfg(feature = "std")]
1472 #[inline]
1473 fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>(
1474 &self,
1475 needle: N,
1476 replacement: R,
1477 limit: usize,
1478 ) -> Vec<u8> {
1479 let mut dest = Vec::with_capacity(self.as_bytes().len());
1480 self.replacen_into(needle, replacement, limit, &mut dest);
1481 dest
1482 }
1483
1484 /// Replace all matches of the given needle with the given replacement,
1485 /// and write the result into the provided `Vec<u8>`.
1486 ///
1487 /// This does **not** clear `dest` before writing to it.
1488 ///
1489 /// This routine is useful for reusing allocation. For a more convenient
1490 /// API, use [`replace`](#method.replace) instead.
1491 ///
1492 /// # Examples
1493 ///
1494 /// Basic usage:
1495 ///
1496 /// ```
1497 /// use bstr::ByteSlice;
1498 ///
1499 /// let s = b"this is old";
1500 ///
1501 /// let mut dest = vec![];
1502 /// s.replace_into("old", "new", &mut dest);
1503 /// assert_eq!(dest, "this is new".as_bytes());
1504 /// ```
1505 ///
1506 /// When the pattern doesn't match:
1507 ///
1508 /// ```
1509 /// use bstr::ByteSlice;
1510 ///
1511 /// let s = b"this is old";
1512 ///
1513 /// let mut dest = vec![];
1514 /// s.replace_into("nada nada", "limonada", &mut dest);
1515 /// assert_eq!(dest, "this is old".as_bytes());
1516 /// ```
1517 ///
1518 /// When the needle is an empty string:
1519 ///
1520 /// ```
1521 /// use bstr::ByteSlice;
1522 ///
1523 /// let s = b"foo";
1524 ///
1525 /// let mut dest = vec![];
1526 /// s.replace_into("", "Z", &mut dest);
1527 /// assert_eq!(dest, "ZfZoZoZ".as_bytes());
1528 /// ```
1529 #[cfg(feature = "std")]
1530 #[inline]
1531 fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
1532 &self,
1533 needle: N,
1534 replacement: R,
1535 dest: &mut Vec<u8>,
1536 ) {
1537 let (needle, replacement) = (needle.as_ref(), replacement.as_ref());
1538
1539 let mut last = 0;
1540 for start in self.find_iter(needle) {
1541 dest.push_str(&self.as_bytes()[last..start]);
1542 dest.push_str(replacement);
1543 last = start + needle.len();
1544 }
1545 dest.push_str(&self.as_bytes()[last..]);
1546 }
1547
1548 /// Replace up to `limit` matches of the given needle with the given
1549 /// replacement, and write the result into the provided `Vec<u8>`.
1550 ///
1551 /// This does **not** clear `dest` before writing to it.
1552 ///
1553 /// This routine is useful for reusing allocation. For a more convenient
1554 /// API, use [`replacen`](#method.replacen) instead.
1555 ///
1556 /// # Examples
1557 ///
1558 /// Basic usage:
1559 ///
1560 /// ```
1561 /// use bstr::ByteSlice;
1562 ///
1563 /// let s = b"foofoo";
1564 ///
1565 /// let mut dest = vec![];
1566 /// s.replacen_into("o", "z", 2, &mut dest);
1567 /// assert_eq!(dest, "fzzfoo".as_bytes());
1568 /// ```
1569 ///
1570 /// When the pattern doesn't match:
1571 ///
1572 /// ```
1573 /// use bstr::ByteSlice;
1574 ///
1575 /// let s = b"foofoo";
1576 ///
1577 /// let mut dest = vec![];
1578 /// s.replacen_into("a", "z", 2, &mut dest);
1579 /// assert_eq!(dest, "foofoo".as_bytes());
1580 /// ```
1581 ///
1582 /// When the needle is an empty string:
1583 ///
1584 /// ```
1585 /// use bstr::ByteSlice;
1586 ///
1587 /// let s = b"foo";
1588 ///
1589 /// let mut dest = vec![];
1590 /// s.replacen_into("", "Z", 2, &mut dest);
1591 /// assert_eq!(dest, "ZfZoo".as_bytes());
1592 /// ```
1593 #[cfg(feature = "std")]
1594 #[inline]
1595 fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
1596 &self,
1597 needle: N,
1598 replacement: R,
1599 limit: usize,
1600 dest: &mut Vec<u8>,
1601 ) {
1602 let (needle, replacement) = (needle.as_ref(), replacement.as_ref());
1603
1604 let mut last = 0;
1605 for start in self.find_iter(needle).take(limit) {
1606 dest.push_str(&self.as_bytes()[last..start]);
1607 dest.push_str(replacement);
1608 last = start + needle.len();
1609 }
1610 dest.push_str(&self.as_bytes()[last..]);
1611 }
1612
1613 /// Returns an iterator over the bytes in this byte string.
1614 ///
1615 /// # Examples
1616 ///
1617 /// Basic usage:
1618 ///
1619 /// ```
1620 /// use bstr::ByteSlice;
1621 ///
1622 /// let bs = b"foobar";
1623 /// let bytes: Vec<u8> = bs.bytes().collect();
1624 /// assert_eq!(bytes, bs);
1625 /// ```
1626 #[inline]
1627 fn bytes(&self) -> Bytes {
1628 Bytes { it: self.as_bytes().iter() }
1629 }
1630
1631 /// Returns an iterator over the Unicode scalar values in this byte string.
1632 /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
1633 /// is yielded instead.
1634 ///
1635 /// # Examples
1636 ///
1637 /// Basic usage:
1638 ///
1639 /// ```
1640 /// use bstr::ByteSlice;
1641 ///
1642 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1643 /// let chars: Vec<char> = bs.chars().collect();
1644 /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
1645 /// ```
1646 ///
1647 /// Codepoints can also be iterated over in reverse:
1648 ///
1649 /// ```
1650 /// use bstr::ByteSlice;
1651 ///
1652 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1653 /// let chars: Vec<char> = bs.chars().rev().collect();
1654 /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
1655 /// ```
1656 #[inline]
1657 fn chars(&self) -> Chars {
1658 Chars::new(self.as_bytes())
1659 }
1660
1661 /// Returns an iterator over the Unicode scalar values in this byte string
1662 /// along with their starting and ending byte index positions. If invalid
1663 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1664 /// instead.
1665 ///
1666 /// Note that this is slightly different from the `CharIndices` iterator
1667 /// provided by the standard library. Aside from working on possibly
1668 /// invalid UTF-8, this iterator provides both the corresponding starting
1669 /// and ending byte indices of each codepoint yielded. The ending position
1670 /// is necessary to slice the original byte string when invalid UTF-8 bytes
1671 /// are converted into a Unicode replacement codepoint, since a single
1672 /// replacement codepoint can substitute anywhere from 1 to 3 invalid bytes
1673 /// (inclusive).
1674 ///
1675 /// # Examples
1676 ///
1677 /// Basic usage:
1678 ///
1679 /// ```
1680 /// use bstr::ByteSlice;
1681 ///
1682 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1683 /// let chars: Vec<(usize, usize, char)> = bs.char_indices().collect();
1684 /// assert_eq!(chars, vec![
1685 /// (0, 3, '☃'),
1686 /// (3, 4, '\u{FFFD}'),
1687 /// (4, 8, '𝞃'),
1688 /// (8, 10, '\u{FFFD}'),
1689 /// (10, 11, 'a'),
1690 /// ]);
1691 /// ```
1692 ///
1693 /// Codepoints can also be iterated over in reverse:
1694 ///
1695 /// ```
1696 /// use bstr::ByteSlice;
1697 ///
1698 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1699 /// let chars: Vec<(usize, usize, char)> = bs
1700 /// .char_indices()
1701 /// .rev()
1702 /// .collect();
1703 /// assert_eq!(chars, vec![
1704 /// (10, 11, 'a'),
1705 /// (8, 10, '\u{FFFD}'),
1706 /// (4, 8, '𝞃'),
1707 /// (3, 4, '\u{FFFD}'),
1708 /// (0, 3, '☃'),
1709 /// ]);
1710 /// ```
1711 #[inline]
1712 fn char_indices(&self) -> CharIndices {
1713 CharIndices::new(self.as_bytes())
1714 }
1715
1716 /// Iterate over chunks of valid UTF-8.
1717 ///
1718 /// The iterator returned yields chunks of valid UTF-8 separated by invalid
1719 /// UTF-8 bytes, if they exist. Invalid UTF-8 bytes are always 1-3 bytes,
1720 /// which are determined via the "substitution of maximal subparts"
1721 /// strategy described in the docs for the
1722 /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
1723 /// method.
1724 ///
1725 /// # Examples
1726 ///
1727 /// This example shows how to gather all valid and invalid chunks from a
1728 /// byte slice:
1729 ///
1730 /// ```
1731 /// use bstr::{ByteSlice, Utf8Chunk};
1732 ///
1733 /// let bytes = b"foo\xFD\xFEbar\xFF";
1734 ///
1735 /// let (mut valid_chunks, mut invalid_chunks) = (vec![], vec![]);
1736 /// for chunk in bytes.utf8_chunks() {
1737 /// if !chunk.valid().is_empty() {
1738 /// valid_chunks.push(chunk.valid());
1739 /// }
1740 /// if !chunk.invalid().is_empty() {
1741 /// invalid_chunks.push(chunk.invalid());
1742 /// }
1743 /// }
1744 ///
1745 /// assert_eq!(valid_chunks, vec!["foo", "bar"]);
1746 /// assert_eq!(invalid_chunks, vec![b"\xFD", b"\xFE", b"\xFF"]);
1747 /// ```
1748 #[inline]
1749 fn utf8_chunks(&self) -> Utf8Chunks {
1750 Utf8Chunks { bytes: self.as_bytes() }
1751 }
1752
1753 /// Returns an iterator over the grapheme clusters in this byte string.
1754 /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
1755 /// is yielded instead.
1756 ///
1757 /// # Examples
1758 ///
1759 /// This example shows how multiple codepoints can combine to form a
1760 /// single grapheme cluster:
1761 ///
1762 /// ```
1763 /// use bstr::ByteSlice;
1764 ///
1765 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1766 /// let graphemes: Vec<&str> = bs.graphemes().collect();
1767 /// assert_eq!(vec!["à̖", "🇺🇸"], graphemes);
1768 /// ```
1769 ///
1770 /// This shows that graphemes can be iterated over in reverse:
1771 ///
1772 /// ```
1773 /// use bstr::ByteSlice;
1774 ///
1775 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1776 /// let graphemes: Vec<&str> = bs.graphemes().rev().collect();
1777 /// assert_eq!(vec!["🇺🇸", "à̖"], graphemes);
1778 /// ```
1779 #[cfg(feature = "unicode")]
1780 #[inline]
1781 fn graphemes(&self) -> Graphemes {
1782 Graphemes::new(self.as_bytes())
1783 }
1784
1785 /// Returns an iterator over the grapheme clusters in this byte string
1786 /// along with their starting and ending byte index positions. If invalid
1787 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1788 /// instead.
1789 ///
1790 /// # Examples
1791 ///
1792 /// This example shows how to get the byte offsets of each individual
1793 /// grapheme cluster:
1794 ///
1795 /// ```
1796 /// use bstr::ByteSlice;
1797 ///
1798 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1799 /// let graphemes: Vec<(usize, usize, &str)> =
1800 /// bs.grapheme_indices().collect();
1801 /// assert_eq!(vec![(0, 5, "à̖"), (5, 13, "🇺🇸")], graphemes);
1802 /// ```
1803 ///
1804 /// This example shows what happens when invalid UTF-8 is encountered. Note
1805 /// that the offsets are valid indices into the original string, and do
1806 /// not necessarily correspond to the length of the `&str` returned!
1807 ///
1808 /// ```
1809 /// use bstr::{ByteSlice, ByteVec};
1810 ///
1811 /// let mut bytes = vec![];
1812 /// bytes.push_str("a\u{0300}\u{0316}");
1813 /// bytes.push(b'\xFF');
1814 /// bytes.push_str("\u{1F1FA}\u{1F1F8}");
1815 ///
1816 /// let graphemes: Vec<(usize, usize, &str)> =
1817 /// bytes.grapheme_indices().collect();
1818 /// assert_eq!(
1819 /// graphemes,
1820 /// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "🇺🇸")]
1821 /// );
1822 /// ```
1823 #[cfg(feature = "unicode")]
1824 #[inline]
1825 fn grapheme_indices(&self) -> GraphemeIndices {
1826 GraphemeIndices::new(self.as_bytes())
1827 }
1828
1829 /// Returns an iterator over the words in this byte string. If invalid
1830 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1831 /// instead.
1832 ///
1833 /// This is similar to
1834 /// [`words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks),
1835 /// except it only returns elements that contain a "word" character. A word
1836 /// character is defined by UTS #18 (Annex C) to be the combination of the
1837 /// `Alphabetic` and `Join_Control` properties, along with the
1838 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
1839 /// categories.
1840 ///
1841 /// Since words are made up of one or more codepoints, this iterator
1842 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1843 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1844 ///
1845 /// # Examples
1846 ///
1847 /// Basic usage:
1848 ///
1849 /// ```
1850 /// use bstr::ByteSlice;
1851 ///
1852 /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
1853 /// let words: Vec<&str> = bs.words().collect();
1854 /// assert_eq!(words, vec![
1855 /// "The", "quick", "brown", "fox", "can't",
1856 /// "jump", "32.3", "feet", "right",
1857 /// ]);
1858 /// ```
1859 #[cfg(feature = "unicode")]
1860 #[inline]
1861 fn words(&self) -> Words {
1862 Words::new(self.as_bytes())
1863 }
1864
1865 /// Returns an iterator over the words in this byte string along with
1866 /// their starting and ending byte index positions.
1867 ///
1868 /// This is similar to
1869 /// [`words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices),
1870 /// except it only returns elements that contain a "word" character. A word
1871 /// character is defined by UTS #18 (Annex C) to be the combination of the
1872 /// `Alphabetic` and `Join_Control` properties, along with the
1873 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
1874 /// categories.
1875 ///
1876 /// Since words are made up of one or more codepoints, this iterator
1877 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1878 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1879 ///
1880 /// # Examples
1881 ///
1882 /// This example shows how to get the byte offsets of each individual
1883 /// word:
1884 ///
1885 /// ```
1886 /// use bstr::ByteSlice;
1887 ///
1888 /// let bs = b"can't jump 32.3 feet";
1889 /// let words: Vec<(usize, usize, &str)> = bs.word_indices().collect();
1890 /// assert_eq!(words, vec![
1891 /// (0, 5, "can't"),
1892 /// (6, 10, "jump"),
1893 /// (11, 15, "32.3"),
1894 /// (16, 20, "feet"),
1895 /// ]);
1896 /// ```
1897 #[cfg(feature = "unicode")]
1898 #[inline]
1899 fn word_indices(&self) -> WordIndices {
1900 WordIndices::new(self.as_bytes())
1901 }
1902
1903 /// Returns an iterator over the words in this byte string, along with
1904 /// all breaks between the words. Concatenating all elements yielded by
1905 /// the iterator results in the original string (modulo Unicode replacement
1906 /// codepoint substitutions if invalid UTF-8 is encountered).
1907 ///
1908 /// Since words are made up of one or more codepoints, this iterator
1909 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1910 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1911 ///
1912 /// # Examples
1913 ///
1914 /// Basic usage:
1915 ///
1916 /// ```
1917 /// use bstr::ByteSlice;
1918 ///
1919 /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
1920 /// let words: Vec<&str> = bs.words_with_breaks().collect();
1921 /// assert_eq!(words, vec![
1922 /// "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")",
1923 /// " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet",
1924 /// ",", " ", "right", "?",
1925 /// ]);
1926 /// ```
1927 #[cfg(feature = "unicode")]
1928 #[inline]
1929 fn words_with_breaks(&self) -> WordsWithBreaks {
1930 WordsWithBreaks::new(self.as_bytes())
1931 }
1932
1933 /// Returns an iterator over the words and their byte offsets in this
1934 /// byte string, along with all breaks between the words. Concatenating
1935 /// all elements yielded by the iterator results in the original string
1936 /// (modulo Unicode replacement codepoint substitutions if invalid UTF-8 is
1937 /// encountered).
1938 ///
1939 /// Since words are made up of one or more codepoints, this iterator
1940 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1941 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1942 ///
1943 /// # Examples
1944 ///
1945 /// This example shows how to get the byte offsets of each individual
1946 /// word:
1947 ///
1948 /// ```
1949 /// use bstr::ByteSlice;
1950 ///
1951 /// let bs = b"can't jump 32.3 feet";
1952 /// let words: Vec<(usize, usize, &str)> =
1953 /// bs.words_with_break_indices().collect();
1954 /// assert_eq!(words, vec![
1955 /// (0, 5, "can't"),
1956 /// (5, 6, " "),
1957 /// (6, 10, "jump"),
1958 /// (10, 11, " "),
1959 /// (11, 15, "32.3"),
1960 /// (15, 16, " "),
1961 /// (16, 20, "feet"),
1962 /// ]);
1963 /// ```
1964 #[cfg(feature = "unicode")]
1965 #[inline]
1966 fn words_with_break_indices(&self) -> WordsWithBreakIndices {
1967 WordsWithBreakIndices::new(self.as_bytes())
1968 }
1969
1970 /// Returns an iterator over the sentences in this byte string.
1971 ///
1972 /// Typically, a sentence will include its trailing punctuation and
1973 /// whitespace. Concatenating all elements yielded by the iterator
1974 /// results in the original string (modulo Unicode replacement codepoint
1975 /// substitutions if invalid UTF-8 is encountered).
1976 ///
1977 /// Since sentences are made up of one or more codepoints, this iterator
1978 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1979 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1980 ///
1981 /// # Examples
1982 ///
1983 /// Basic usage:
1984 ///
1985 /// ```
1986 /// use bstr::ByteSlice;
1987 ///
1988 /// let bs = b"I want this. Not that. Right now.";
1989 /// let sentences: Vec<&str> = bs.sentences().collect();
1990 /// assert_eq!(sentences, vec![
1991 /// "I want this. ",
1992 /// "Not that. ",
1993 /// "Right now.",
1994 /// ]);
1995 /// ```
1996 #[cfg(feature = "unicode")]
1997 #[inline]
1998 fn sentences(&self) -> Sentences {
1999 Sentences::new(self.as_bytes())
2000 }
2001
2002 /// Returns an iterator over the sentences in this byte string along with
2003 /// their starting and ending byte index positions.
2004 ///
2005 /// Typically, a sentence will include its trailing punctuation and
2006 /// whitespace. Concatenating all elements yielded by the iterator
2007 /// results in the original string (modulo Unicode replacement codepoint
2008 /// substitutions if invalid UTF-8 is encountered).
2009 ///
2010 /// Since sentences are made up of one or more codepoints, this iterator
2011 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
2012 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
2013 ///
2014 /// # Examples
2015 ///
2016 /// Basic usage:
2017 ///
2018 /// ```
2019 /// use bstr::ByteSlice;
2020 ///
2021 /// let bs = b"I want this. Not that. Right now.";
2022 /// let sentences: Vec<(usize, usize, &str)> =
2023 /// bs.sentence_indices().collect();
2024 /// assert_eq!(sentences, vec![
2025 /// (0, 13, "I want this. "),
2026 /// (13, 23, "Not that. "),
2027 /// (23, 33, "Right now."),
2028 /// ]);
2029 /// ```
2030 #[cfg(feature = "unicode")]
2031 #[inline]
2032 fn sentence_indices(&self) -> SentenceIndices {
2033 SentenceIndices::new(self.as_bytes())
2034 }
2035
2036 /// An iterator over all lines in a byte string, without their
2037 /// terminators.
2038 ///
2039 /// For this iterator, the only line terminators recognized are `\r\n` and
2040 /// `\n`.
2041 ///
2042 /// # Examples
2043 ///
2044 /// Basic usage:
2045 ///
2046 /// ```
2047 /// use bstr::{B, ByteSlice};
2048 ///
2049 /// let s = b"\
2050 /// foo
2051 ///
2052 /// bar\r
2053 /// baz
2054 ///
2055 ///
2056 /// quux";
2057 /// let lines: Vec<&[u8]> = s.lines().collect();
2058 /// assert_eq!(lines, vec![
2059 /// B("foo"), B(""), B("bar"), B("baz"), B(""), B(""), B("quux"),
2060 /// ]);
2061 /// ```
2062 #[inline]
2063 fn lines(&self) -> Lines {
2064 Lines::new(self.as_bytes())
2065 }
2066
2067 /// An iterator over all lines in a byte string, including their
2068 /// terminators.
2069 ///
2070 /// For this iterator, the only line terminator recognized is `\n`. (Since
2071 /// line terminators are included, this also handles `\r\n` line endings.)
2072 ///
2073 /// Line terminators are only included if they are present in the original
2074 /// byte string. For example, the last line in a byte string may not end
2075 /// with a line terminator.
2076 ///
2077 /// Concatenating all elements yielded by this iterator is guaranteed to
2078 /// yield the original byte string.
2079 ///
2080 /// # Examples
2081 ///
2082 /// Basic usage:
2083 ///
2084 /// ```
2085 /// use bstr::{B, ByteSlice};
2086 ///
2087 /// let s = b"\
2088 /// foo
2089 ///
2090 /// bar\r
2091 /// baz
2092 ///
2093 ///
2094 /// quux";
2095 /// let lines: Vec<&[u8]> = s.lines_with_terminator().collect();
2096 /// assert_eq!(lines, vec![
2097 /// B("foo\n"),
2098 /// B("\n"),
2099 /// B("bar\r\n"),
2100 /// B("baz\n"),
2101 /// B("\n"),
2102 /// B("\n"),
2103 /// B("quux"),
2104 /// ]);
2105 /// ```
2106 #[inline]
2107 fn lines_with_terminator(&self) -> LinesWithTerminator {
2108 LinesWithTerminator::new(self.as_bytes())
2109 }
2110
2111 /// Return a byte string slice with leading and trailing whitespace
2112 /// removed.
2113 ///
2114 /// Whitespace is defined according to the terms of the `White_Space`
2115 /// Unicode property.
2116 ///
2117 /// # Examples
2118 ///
2119 /// Basic usage:
2120 ///
2121 /// ```
2122 /// use bstr::{B, ByteSlice};
2123 ///
2124 /// let s = B(" foo\tbar\t\u{2003}\n");
2125 /// assert_eq!(s.trim(), B("foo\tbar"));
2126 /// ```
2127 #[cfg(feature = "unicode")]
2128 #[inline]
2129 fn trim(&self) -> &[u8] {
2130 self.trim_start().trim_end()
2131 }
2132
2133 /// Return a byte string slice with leading whitespace removed.
2134 ///
2135 /// Whitespace is defined according to the terms of the `White_Space`
2136 /// Unicode property.
2137 ///
2138 /// # Examples
2139 ///
2140 /// Basic usage:
2141 ///
2142 /// ```
2143 /// use bstr::{B, ByteSlice};
2144 ///
2145 /// let s = B(" foo\tbar\t\u{2003}\n");
2146 /// assert_eq!(s.trim_start(), B("foo\tbar\t\u{2003}\n"));
2147 /// ```
2148 #[cfg(feature = "unicode")]
2149 #[inline]
2150 fn trim_start(&self) -> &[u8] {
2151 let start = whitespace_len_fwd(self.as_bytes());
2152 &self.as_bytes()[start..]
2153 }
2154
2155 /// Return a byte string slice with trailing whitespace removed.
2156 ///
2157 /// Whitespace is defined according to the terms of the `White_Space`
2158 /// Unicode property.
2159 ///
2160 /// # Examples
2161 ///
2162 /// Basic usage:
2163 ///
2164 /// ```
2165 /// use bstr::{B, ByteSlice};
2166 ///
2167 /// let s = B(" foo\tbar\t\u{2003}\n");
2168 /// assert_eq!(s.trim_end(), B(" foo\tbar"));
2169 /// ```
2170 #[cfg(feature = "unicode")]
2171 #[inline]
2172 fn trim_end(&self) -> &[u8] {
2173 let end = whitespace_len_rev(self.as_bytes());
2174 &self.as_bytes()[..end]
2175 }
2176
2177 /// Return a byte string slice with leading and trailing characters
2178 /// satisfying the given predicate removed.
2179 ///
2180 /// # Examples
2181 ///
2182 /// Basic usage:
2183 ///
2184 /// ```
2185 /// use bstr::{B, ByteSlice};
2186 ///
2187 /// let s = b"123foo5bar789";
2188 /// assert_eq!(s.trim_with(|c| c.is_numeric()), B("foo5bar"));
2189 /// ```
2190 #[inline]
2191 fn trim_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2192 self.trim_start_with(&mut trim).trim_end_with(&mut trim)
2193 }
2194
2195 /// Return a byte string slice with leading characters satisfying the given
2196 /// predicate removed.
2197 ///
2198 /// # Examples
2199 ///
2200 /// Basic usage:
2201 ///
2202 /// ```
2203 /// use bstr::{B, ByteSlice};
2204 ///
2205 /// let s = b"123foo5bar789";
2206 /// assert_eq!(s.trim_start_with(|c| c.is_numeric()), B("foo5bar789"));
2207 /// ```
2208 #[inline]
2209 fn trim_start_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2210 for (s, _, ch) in self.char_indices() {
2211 if !trim(ch) {
2212 return &self.as_bytes()[s..];
2213 }
2214 }
2215 b""
2216 }
2217
2218 /// Return a byte string slice with trailing characters satisfying the
2219 /// given predicate removed.
2220 ///
2221 /// # Examples
2222 ///
2223 /// Basic usage:
2224 ///
2225 /// ```
2226 /// use bstr::{B, ByteSlice};
2227 ///
/// let s = b"123foo5bar789";
2229 /// assert_eq!(s.trim_end_with(|c| c.is_numeric()), B("123foo5bar"));
2230 /// ```
2231 #[inline]
2232 fn trim_end_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2233 for (_, e, ch) in self.char_indices().rev() {
2234 if !trim(ch) {
2235 return &self.as_bytes()[..e];
2236 }
2237 }
2238 b""
2239 }
2240
2241 /// Returns a new `Vec<u8>` containing the lowercase equivalent of this
2242 /// byte string.
2243 ///
2244 /// In this case, lowercase is defined according to the `Lowercase` Unicode
2245 /// property.
2246 ///
/// If invalid UTF-8 is seen, or if a character has no lowercase variant,
/// then it is copied to the returned `Vec<u8>` unchanged.
///
/// Note that some characters in this byte string may expand into multiple
/// characters when changing the case, so the number of bytes in the
/// returned `Vec<u8>` may not be equivalent to the number of bytes in
/// this byte string.
2254 ///
2255 /// If you'd like to reuse an allocation for performance reasons, then use
2256 /// [`to_lowercase_into`](#method.to_lowercase_into) instead.
2257 ///
2258 /// # Examples
2259 ///
2260 /// Basic usage:
2261 ///
2262 /// ```
2263 /// use bstr::{B, ByteSlice};
2264 ///
2265 /// let s = B("HELLO Β");
2266 /// assert_eq!("hello β".as_bytes(), s.to_lowercase().as_bytes());
2267 /// ```
2268 ///
2269 /// Scripts without case are not changed:
2270 ///
2271 /// ```
2272 /// use bstr::{B, ByteSlice};
2273 ///
2274 /// let s = B("农历新年");
2275 /// assert_eq!("农历新年".as_bytes(), s.to_lowercase().as_bytes());
2276 /// ```
2277 ///
2278 /// Invalid UTF-8 remains as is:
2279 ///
2280 /// ```
2281 /// use bstr::{B, ByteSlice};
2282 ///
2283 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2284 /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes());
2285 /// ```
2286 #[cfg(all(feature = "std", feature = "unicode"))]
2287 #[inline]
2288 fn to_lowercase(&self) -> Vec<u8> {
2289 let mut buf = vec![];
2290 self.to_lowercase_into(&mut buf);
2291 buf
2292 }
2293
2294 /// Writes the lowercase equivalent of this byte string into the given
2295 /// buffer. The buffer is not cleared before written to.
2296 ///
2297 /// In this case, lowercase is defined according to the `Lowercase`
2298 /// Unicode property.
2299 ///
2300 /// If invalid UTF-8 is seen, or if a character has no lowercase variant,
2301 /// then it is written to the given buffer unchanged.
2302 ///
2303 /// Note that some characters in this byte string may expand into multiple
2304 /// characters when changing the case, so the number of bytes written to
2305 /// the given byte string may not be equivalent to the number of bytes in
2306 /// this byte string.
2307 ///
2308 /// If you don't need to amortize allocation and instead prefer
2309 /// convenience, then use [`to_lowercase`](#method.to_lowercase) instead.
2310 ///
2311 /// # Examples
2312 ///
2313 /// Basic usage:
2314 ///
2315 /// ```
2316 /// use bstr::{B, ByteSlice};
2317 ///
2318 /// let s = B("HELLO Β");
2319 ///
2320 /// let mut buf = vec![];
2321 /// s.to_lowercase_into(&mut buf);
2322 /// assert_eq!("hello β".as_bytes(), buf.as_bytes());
2323 /// ```
2324 ///
2325 /// Scripts without case are not changed:
2326 ///
2327 /// ```
2328 /// use bstr::{B, ByteSlice};
2329 ///
2330 /// let s = B("农历新年");
2331 ///
2332 /// let mut buf = vec![];
2333 /// s.to_lowercase_into(&mut buf);
2334 /// assert_eq!("农历新年".as_bytes(), buf.as_bytes());
2335 /// ```
2336 ///
2337 /// Invalid UTF-8 remains as is:
2338 ///
2339 /// ```
2340 /// use bstr::{B, ByteSlice};
2341 ///
2342 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2343 ///
2344 /// let mut buf = vec![];
2345 /// s.to_lowercase_into(&mut buf);
2346 /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes());
2347 /// ```
2348 #[cfg(all(feature = "std", feature = "unicode"))]
2349 #[inline]
2350 fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
2351 // TODO: This is the best we can do given what std exposes I think.
2352 // If we roll our own case handling, then we might be able to do this
2353 // a bit faster. We shouldn't roll our own case handling unless we
2354 // need to, e.g., for doing caseless matching or case folding.
2355
2356 // TODO(BUG): This doesn't handle any special casing rules.
2357
2358 buf.reserve(self.as_bytes().len());
2359 for (s, e, ch) in self.char_indices() {
2360 if ch == '\u{FFFD}' {
2361 buf.push_str(&self.as_bytes()[s..e]);
2362 } else if ch.is_ascii() {
2363 buf.push_char(ch.to_ascii_lowercase());
2364 } else {
2365 for upper in ch.to_lowercase() {
2366 buf.push_char(upper);
2367 }
2368 }
2369 }
2370 }
2371
2372 /// Returns a new `Vec<u8>` containing the ASCII lowercase equivalent of
2373 /// this byte string.
2374 ///
2375 /// In this case, lowercase is only defined in ASCII letters. Namely, the
2376 /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
2377 /// In particular, the length of the byte string returned is always
2378 /// equivalent to the length of this byte string.
2379 ///
2380 /// If you'd like to reuse an allocation for performance reasons, then use
2381 /// [`make_ascii_lowercase`](#method.make_ascii_lowercase) to perform
2382 /// the conversion in place.
2383 ///
2384 /// # Examples
2385 ///
2386 /// Basic usage:
2387 ///
2388 /// ```
2389 /// use bstr::{B, ByteSlice};
2390 ///
2391 /// let s = B("HELLO Β");
2392 /// assert_eq!("hello Β".as_bytes(), s.to_ascii_lowercase().as_bytes());
2393 /// ```
2394 ///
2395 /// Invalid UTF-8 remains as is:
2396 ///
2397 /// ```
2398 /// use bstr::{B, ByteSlice};
2399 ///
2400 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2401 /// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz"));
2402 /// ```
2403 #[cfg(feature = "std")]
2404 #[inline]
2405 fn to_ascii_lowercase(&self) -> Vec<u8> {
2406 self.as_bytes().to_ascii_lowercase()
2407 }
2408
2409 /// Convert this byte string to its lowercase ASCII equivalent in place.
2410 ///
2411 /// In this case, lowercase is only defined in ASCII letters. Namely, the
2412 /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
2413 ///
2414 /// If you don't need to do the conversion in
2415 /// place and instead prefer convenience, then use
2416 /// [`to_ascii_lowercase`](#method.to_ascii_lowercase) instead.
2417 ///
2418 /// # Examples
2419 ///
2420 /// Basic usage:
2421 ///
2422 /// ```
2423 /// use bstr::ByteSlice;
2424 ///
2425 /// let mut s = <Vec<u8>>::from("HELLO Β");
2426 /// s.make_ascii_lowercase();
2427 /// assert_eq!(s, "hello Β".as_bytes());
2428 /// ```
2429 ///
2430 /// Invalid UTF-8 remains as is:
2431 ///
2432 /// ```
2433 /// use bstr::{B, ByteSlice, ByteVec};
2434 ///
2435 /// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
2436 /// s.make_ascii_lowercase();
2437 /// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
2438 /// ```
2439 #[inline]
2440 fn make_ascii_lowercase(&mut self) {
2441 self.as_bytes_mut().make_ascii_lowercase();
2442 }
2443
2444 /// Returns a new `Vec<u8>` containing the uppercase equivalent of this
2445 /// byte string.
2446 ///
2447 /// In this case, uppercase is defined according to the `Uppercase`
2448 /// Unicode property.
2449 ///
/// If invalid UTF-8 is seen, or if a character has no uppercase variant,
/// then it is copied to the returned `Vec<u8>` unchanged.
///
/// Note that some characters in this byte string may expand into multiple
/// characters when changing the case, so the number of bytes in the
/// returned `Vec<u8>` may not be equivalent to the number of bytes in
/// this byte string.
2457 ///
2458 /// If you'd like to reuse an allocation for performance reasons, then use
2459 /// [`to_uppercase_into`](#method.to_uppercase_into) instead.
2460 ///
2461 /// # Examples
2462 ///
2463 /// Basic usage:
2464 ///
2465 /// ```
2466 /// use bstr::{B, ByteSlice};
2467 ///
2468 /// let s = B("hello β");
2469 /// assert_eq!(s.to_uppercase(), B("HELLO Β"));
2470 /// ```
2471 ///
2472 /// Scripts without case are not changed:
2473 ///
2474 /// ```
2475 /// use bstr::{B, ByteSlice};
2476 ///
2477 /// let s = B("农历新年");
2478 /// assert_eq!(s.to_uppercase(), B("农历新年"));
2479 /// ```
2480 ///
2481 /// Invalid UTF-8 remains as is:
2482 ///
2483 /// ```
2484 /// use bstr::{B, ByteSlice};
2485 ///
2486 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2487 /// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
2488 /// ```
2489 #[cfg(all(feature = "std", feature = "unicode"))]
2490 #[inline]
2491 fn to_uppercase(&self) -> Vec<u8> {
2492 let mut buf = vec![];
2493 self.to_uppercase_into(&mut buf);
2494 buf
2495 }
2496
2497 /// Writes the uppercase equivalent of this byte string into the given
2498 /// buffer. The buffer is not cleared before written to.
2499 ///
2500 /// In this case, uppercase is defined according to the `Uppercase`
2501 /// Unicode property.
2502 ///
2503 /// If invalid UTF-8 is seen, or if a character has no uppercase variant,
2504 /// then it is written to the given buffer unchanged.
2505 ///
2506 /// Note that some characters in this byte string may expand into multiple
2507 /// characters when changing the case, so the number of bytes written to
2508 /// the given byte string may not be equivalent to the number of bytes in
2509 /// this byte string.
2510 ///
2511 /// If you don't need to amortize allocation and instead prefer
2512 /// convenience, then use [`to_uppercase`](#method.to_uppercase) instead.
2513 ///
2514 /// # Examples
2515 ///
2516 /// Basic usage:
2517 ///
2518 /// ```
2519 /// use bstr::{B, ByteSlice};
2520 ///
2521 /// let s = B("hello β");
2522 ///
2523 /// let mut buf = vec![];
2524 /// s.to_uppercase_into(&mut buf);
2525 /// assert_eq!(buf, B("HELLO Β"));
2526 /// ```
2527 ///
2528 /// Scripts without case are not changed:
2529 ///
2530 /// ```
2531 /// use bstr::{B, ByteSlice};
2532 ///
2533 /// let s = B("农历新年");
2534 ///
2535 /// let mut buf = vec![];
2536 /// s.to_uppercase_into(&mut buf);
2537 /// assert_eq!(buf, B("农历新年"));
2538 /// ```
2539 ///
2540 /// Invalid UTF-8 remains as is:
2541 ///
2542 /// ```
2543 /// use bstr::{B, ByteSlice};
2544 ///
2545 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2546 ///
2547 /// let mut buf = vec![];
2548 /// s.to_uppercase_into(&mut buf);
2549 /// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ"));
2550 /// ```
2551 #[cfg(all(feature = "std", feature = "unicode"))]
2552 #[inline]
2553 fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
2554 // TODO: This is the best we can do given what std exposes I think.
2555 // If we roll our own case handling, then we might be able to do this
2556 // a bit faster. We shouldn't roll our own case handling unless we
2557 // need to, e.g., for doing caseless matching or case folding.
2558 buf.reserve(self.as_bytes().len());
2559 for (s, e, ch) in self.char_indices() {
2560 if ch == '\u{FFFD}' {
2561 buf.push_str(&self.as_bytes()[s..e]);
2562 } else if ch.is_ascii() {
2563 buf.push_char(ch.to_ascii_uppercase());
2564 } else {
2565 for upper in ch.to_uppercase() {
2566 buf.push_char(upper);
2567 }
2568 }
2569 }
2570 }
2571
2572 /// Returns a new `Vec<u8>` containing the ASCII uppercase equivalent of
2573 /// this byte string.
2574 ///
2575 /// In this case, uppercase is only defined in ASCII letters. Namely, the
2576 /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
2577 /// In particular, the length of the byte string returned is always
2578 /// equivalent to the length of this byte string.
2579 ///
2580 /// If you'd like to reuse an allocation for performance reasons, then use
2581 /// [`make_ascii_uppercase`](#method.make_ascii_uppercase) to perform
2582 /// the conversion in place.
2583 ///
2584 /// # Examples
2585 ///
2586 /// Basic usage:
2587 ///
2588 /// ```
2589 /// use bstr::{B, ByteSlice};
2590 ///
2591 /// let s = B("hello β");
2592 /// assert_eq!(s.to_ascii_uppercase(), B("HELLO β"));
2593 /// ```
2594 ///
2595 /// Invalid UTF-8 remains as is:
2596 ///
2597 /// ```
2598 /// use bstr::{B, ByteSlice};
2599 ///
2600 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2601 /// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
2602 /// ```
2603 #[cfg(feature = "std")]
2604 #[inline]
2605 fn to_ascii_uppercase(&self) -> Vec<u8> {
2606 self.as_bytes().to_ascii_uppercase()
2607 }
2608
2609 /// Convert this byte string to its uppercase ASCII equivalent in place.
2610 ///
2611 /// In this case, uppercase is only defined in ASCII letters. Namely, the
2612 /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
2613 ///
2614 /// If you don't need to do the conversion in
2615 /// place and instead prefer convenience, then use
2616 /// [`to_ascii_uppercase`](#method.to_ascii_uppercase) instead.
2617 ///
2618 /// # Examples
2619 ///
2620 /// Basic usage:
2621 ///
2622 /// ```
2623 /// use bstr::{B, ByteSlice};
2624 ///
2625 /// let mut s = <Vec<u8>>::from("hello β");
2626 /// s.make_ascii_uppercase();
2627 /// assert_eq!(s, B("HELLO β"));
2628 /// ```
2629 ///
2630 /// Invalid UTF-8 remains as is:
2631 ///
2632 /// ```
2633 /// use bstr::{B, ByteSlice, ByteVec};
2634 ///
2635 /// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
2636 /// s.make_ascii_uppercase();
2637 /// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
2638 /// ```
2639 #[inline]
2640 fn make_ascii_uppercase(&mut self) {
2641 self.as_bytes_mut().make_ascii_uppercase();
2642 }
2643
2644 /// Reverse the bytes in this string, in place.
2645 ///
2646 /// This is not necessarily a well formed operation! For example, if this
2647 /// byte string contains valid UTF-8 that isn't ASCII, then reversing the
2648 /// string will likely result in invalid UTF-8 and otherwise non-sensical
2649 /// content.
2650 ///
2651 /// Note that this is equivalent to the generic `[u8]::reverse` method.
2652 /// This method is provided to permit callers to explicitly differentiate
2653 /// between reversing bytes, codepoints and graphemes.
2654 ///
2655 /// # Examples
2656 ///
2657 /// Basic usage:
2658 ///
2659 /// ```
2660 /// use bstr::ByteSlice;
2661 ///
2662 /// let mut s = <Vec<u8>>::from("hello");
2663 /// s.reverse_bytes();
2664 /// assert_eq!(s, "olleh".as_bytes());
2665 /// ```
2666 #[inline]
2667 fn reverse_bytes(&mut self) {
2668 self.as_bytes_mut().reverse();
2669 }
2670
2671 /// Reverse the codepoints in this string, in place.
2672 ///
2673 /// If this byte string is valid UTF-8, then its reversal by codepoint
2674 /// is also guaranteed to be valid UTF-8.
2675 ///
2676 /// This operation is equivalent to the following, but without allocating:
2677 ///
2678 /// ```
2679 /// use bstr::ByteSlice;
2680 ///
2681 /// let mut s = <Vec<u8>>::from("foo☃bar");
2682 ///
2683 /// let mut chars: Vec<char> = s.chars().collect();
2684 /// chars.reverse();
2685 ///
2686 /// let reversed: String = chars.into_iter().collect();
2687 /// assert_eq!(reversed, "rab☃oof");
2688 /// ```
2689 ///
2690 /// Note that this is not necessarily a well formed operation. For example,
2691 /// if this byte string contains grapheme clusters with more than one
2692 /// codepoint, then those grapheme clusters will not necessarily be
2693 /// preserved. If you'd like to preserve grapheme clusters, then use
2694 /// [`reverse_graphemes`](#method.reverse_graphemes) instead.
2695 ///
2696 /// # Examples
2697 ///
2698 /// Basic usage:
2699 ///
2700 /// ```
2701 /// use bstr::ByteSlice;
2702 ///
2703 /// let mut s = <Vec<u8>>::from("foo☃bar");
2704 /// s.reverse_chars();
2705 /// assert_eq!(s, "rab☃oof".as_bytes());
2706 /// ```
2707 ///
2708 /// This example shows that not all reversals lead to a well formed string.
2709 /// For example, in this case, combining marks are used to put accents over
2710 /// some letters, and those accent marks must appear after the codepoints
2711 /// they modify.
2712 ///
2713 /// ```
2714 /// use bstr::{B, ByteSlice};
2715 ///
2716 /// let mut s = <Vec<u8>>::from("résumé");
2717 /// s.reverse_chars();
2718 /// assert_eq!(s, B(b"\xCC\x81emus\xCC\x81er"));
2719 /// ```
2720 ///
2721 /// A word of warning: the above example relies on the fact that
2722 /// `résumé` is in decomposed normal form, which means there are separate
2723 /// codepoints for the accents above `e`. If it is instead in composed
2724 /// normal form, then the example works:
2725 ///
2726 /// ```
2727 /// use bstr::{B, ByteSlice};
2728 ///
2729 /// let mut s = <Vec<u8>>::from("résumé");
2730 /// s.reverse_chars();
2731 /// assert_eq!(s, B("émusér"));
2732 /// ```
2733 ///
2734 /// The point here is to be cautious and not assume that just because
2735 /// `reverse_chars` works in one case, that it therefore works in all
2736 /// cases.
2737 #[inline]
2738 fn reverse_chars(&mut self) {
2739 let mut i = 0;
2740 loop {
2741 let (_, size) = utf8::decode(&self.as_bytes()[i..]);
2742 if size == 0 {
2743 break;
2744 }
2745 if size > 1 {
2746 self.as_bytes_mut()[i..i + size].reverse_bytes();
2747 }
2748 i += size;
2749 }
2750 self.reverse_bytes();
2751 }
2752
2753 /// Reverse the graphemes in this string, in place.
2754 ///
2755 /// If this byte string is valid UTF-8, then its reversal by grapheme
2756 /// is also guaranteed to be valid UTF-8.
2757 ///
2758 /// This operation is equivalent to the following, but without allocating:
2759 ///
2760 /// ```
2761 /// use bstr::ByteSlice;
2762 ///
2763 /// let mut s = <Vec<u8>>::from("foo☃bar");
2764 ///
2765 /// let mut graphemes: Vec<&str> = s.graphemes().collect();
2766 /// graphemes.reverse();
2767 ///
2768 /// let reversed = graphemes.concat();
2769 /// assert_eq!(reversed, "rab☃oof");
2770 /// ```
2771 ///
2772 /// # Examples
2773 ///
2774 /// Basic usage:
2775 ///
2776 /// ```
2777 /// use bstr::ByteSlice;
2778 ///
2779 /// let mut s = <Vec<u8>>::from("foo☃bar");
2780 /// s.reverse_graphemes();
2781 /// assert_eq!(s, "rab☃oof".as_bytes());
2782 /// ```
2783 ///
2784 /// This example shows how this correctly handles grapheme clusters,
2785 /// unlike `reverse_chars`.
2786 ///
2787 /// ```
2788 /// use bstr::ByteSlice;
2789 ///
2790 /// let mut s = <Vec<u8>>::from("résumé");
2791 /// s.reverse_graphemes();
2792 /// assert_eq!(s, "émusér".as_bytes());
2793 /// ```
2794 #[cfg(feature = "unicode")]
2795 #[inline]
2796 fn reverse_graphemes(&mut self) {
2797 use unicode::decode_grapheme;
2798
2799 let mut i = 0;
2800 loop {
2801 let (_, size) = decode_grapheme(&self.as_bytes()[i..]);
2802 if size == 0 {
2803 break;
2804 }
2805 if size > 1 {
2806 self.as_bytes_mut()[i..i + size].reverse_bytes();
2807 }
2808 i += size;
2809 }
2810 self.reverse_bytes();
2811 }
2812
2813 /// Returns true if and only if every byte in this byte string is ASCII.
2814 ///
2815 /// ASCII is an encoding that defines 128 codepoints. A byte corresponds to
2816 /// an ASCII codepoint if and only if it is in the inclusive range
2817 /// `[0, 127]`.
2818 ///
2819 /// # Examples
2820 ///
2821 /// Basic usage:
2822 ///
2823 /// ```
2824 /// use bstr::{B, ByteSlice};
2825 ///
2826 /// assert!(B("abc").is_ascii());
2827 /// assert!(!B("☃βツ").is_ascii());
2828 /// assert!(!B(b"\xFF").is_ascii());
2829 /// ```
2830 #[inline]
2831 fn is_ascii(&self) -> bool {
2832 ascii::first_non_ascii_byte(self.as_bytes()) == self.as_bytes().len()
2833 }
2834
2835 /// Returns true if and only if the entire byte string is valid UTF-8.
2836 ///
2837 /// If you need location information about where a byte string's first
2838 /// invalid UTF-8 byte is, then use the [`to_str`](#method.to_str) method.
2839 ///
2840 /// # Examples
2841 ///
2842 /// Basic usage:
2843 ///
2844 /// ```
2845 /// use bstr::{B, ByteSlice};
2846 ///
2847 /// assert!(B("abc").is_utf8());
2848 /// assert!(B("☃βツ").is_utf8());
2849 /// // invalid bytes
2850 /// assert!(!B(b"abc\xFF").is_utf8());
2851 /// // surrogate encoding
2852 /// assert!(!B(b"\xED\xA0\x80").is_utf8());
2853 /// // incomplete sequence
2854 /// assert!(!B(b"\xF0\x9D\x9Ca").is_utf8());
2855 /// // overlong sequence
2856 /// assert!(!B(b"\xF0\x82\x82\xAC").is_utf8());
2857 /// ```
2858 #[inline]
2859 fn is_utf8(&self) -> bool {
2860 utf8::validate(self.as_bytes()).is_ok()
2861 }
2862
2863 /// Returns the last byte in this byte string, if it's non-empty. If this
2864 /// byte string is empty, this returns `None`.
2865 ///
2866 /// Note that this is like the generic `[u8]::last`, except this returns
2867 /// the byte by value instead of a reference to the byte.
2868 ///
2869 /// # Examples
2870 ///
2871 /// Basic usage:
2872 ///
2873 /// ```
2874 /// use bstr::ByteSlice;
2875 ///
2876 /// assert_eq!(Some(b'z'), b"baz".last_byte());
2877 /// assert_eq!(None, b"".last_byte());
2878 /// ```
2879 #[inline]
2880 fn last_byte(&self) -> Option<u8> {
2881 let bytes = self.as_bytes();
2882 bytes.get(bytes.len().saturating_sub(1)).map(|&b| b)
2883 }
2884
2885 /// Returns the index of the first non-ASCII byte in this byte string (if
2886 /// any such indices exist). Specifically, it returns the index of the
2887 /// first byte with a value greater than or equal to `0x80`.
2888 ///
2889 /// # Examples
2890 ///
2891 /// Basic usage:
2892 ///
2893 /// ```
2894 /// use bstr::{ByteSlice, B};
2895 ///
2896 /// assert_eq!(Some(3), b"abc\xff".find_non_ascii_byte());
2897 /// assert_eq!(None, b"abcde".find_non_ascii_byte());
2898 /// assert_eq!(Some(0), B("😀").find_non_ascii_byte());
2899 /// ```
2900 #[inline]
2901 fn find_non_ascii_byte(&self) -> Option<usize> {
2902 let index = ascii::first_non_ascii_byte(self.as_bytes());
2903 if index == self.as_bytes().len() {
2904 None
2905 } else {
2906 Some(index)
2907 }
2908 }
2909
2910 /// Copies elements from one part of the slice to another part of itself,
2911 /// where the parts may be overlapping.
2912 ///
2913 /// `src` is the range within this byte string to copy from, while `dest`
2914 /// is the starting index of the range within this byte string to copy to.
2915 /// The length indicated by `src` must be less than or equal to the number
2916 /// of bytes from `dest` to the end of the byte string.
2917 ///
2918 /// # Panics
2919 ///
2920 /// Panics if either range is out of bounds, or if `src` is too big to fit
2921 /// into `dest`, or if the end of `src` is before the start.
2922 ///
2923 /// # Examples
2924 ///
2925 /// Copying four bytes within a byte string:
2926 ///
2927 /// ```
2928 /// use bstr::{B, ByteSlice};
2929 ///
2930 /// let mut buf = *b"Hello, World!";
2931 /// let s = &mut buf;
2932 /// s.copy_within_str(1..5, 8);
2933 /// assert_eq!(s, B("Hello, Wello!"));
2934 /// ```
2935 #[inline]
2936 fn copy_within_str<R>(&mut self, src: R, dest: usize)
2937 where
2938 R: ops::RangeBounds<usize>,
2939 {
2940 // TODO: Deprecate this once slice::copy_within stabilizes.
2941 let src_start = match src.start_bound() {
2942 ops::Bound::Included(&n) => n,
2943 ops::Bound::Excluded(&n) => {
2944 n.checked_add(1).expect("attempted to index slice beyond max")
2945 }
2946 ops::Bound::Unbounded => 0,
2947 };
2948 let src_end = match src.end_bound() {
2949 ops::Bound::Included(&n) => {
2950 n.checked_add(1).expect("attempted to index slice beyond max")
2951 }
2952 ops::Bound::Excluded(&n) => n,
2953 ops::Bound::Unbounded => self.as_bytes().len(),
2954 };
2955 assert!(src_start <= src_end, "src end is before src start");
2956 assert!(src_end <= self.as_bytes().len(), "src is out of bounds");
2957 let count = src_end - src_start;
2958 assert!(
2959 dest <= self.as_bytes().len() - count,
2960 "dest is out of bounds",
2961 );
2962
2963 // SAFETY: This is safe because we use ptr::copy to handle overlapping
2964 // copies, and is also safe because we've checked all the bounds above.
2965 // Finally, we are only dealing with u8 data, which is Copy, which
2966 // means we can copy without worrying about ownership/destructors.
2967 unsafe {
2968 ptr::copy(
2969 self.as_bytes().get_unchecked(src_start),
2970 self.as_bytes_mut().get_unchecked_mut(dest),
2971 count,
2972 );
2973 }
2974 }
2975 }
2976
2977 /// A single substring searcher fixed to a particular needle.
2978 ///
2979 /// The purpose of this type is to permit callers to construct a substring
2980 /// searcher that can be used to search haystacks without the overhead of
2981 /// constructing the searcher in the first place. This is a somewhat niche
2982 /// concern when it's necessary to re-use the same needle to search multiple
2983 /// different haystacks with as little overhead as possible. In general, using
2984 /// [`ByteSlice::find`](trait.ByteSlice.html#method.find)
2985 /// or
2986 /// [`ByteSlice::find_iter`](trait.ByteSlice.html#method.find_iter)
2987 /// is good enough, but `Finder` is useful when you can meaningfully observe
2988 /// searcher construction time in a profile.
2989 ///
2990 /// When the `std` feature is enabled, then this type has an `into_owned`
2991 /// version which permits building a `Finder` that is not connected to the
2992 /// lifetime of its needle.
2993 #[derive(Clone, Debug)]
2994 pub struct Finder<'a> {
2995 searcher: TwoWay<'a>,
2996 }
2997
2998 impl<'a> Finder<'a> {
2999 /// Create a new finder for the given needle.
3000 #[inline]
3001 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a> {
3002 Finder { searcher: TwoWay::forward(needle.as_ref()) }
3003 }
3004
3005 /// Convert this finder into its owned variant, such that it no longer
3006 /// borrows the needle.
3007 ///
3008 /// If this is already an owned finder, then this is a no-op. Otherwise,
3009 /// this copies the needle.
3010 ///
3011 /// This is only available when the `std` feature is enabled.
3012 #[cfg(feature = "std")]
3013 #[inline]
3014 pub fn into_owned(self) -> Finder<'static> {
3015 Finder { searcher: self.searcher.into_owned() }
3016 }
3017
3018 /// Returns the needle that this finder searches for.
3019 ///
3020 /// Note that the lifetime of the needle returned is tied to the lifetime
3021 /// of the finder, and may be shorter than the `'a` lifetime. Namely, a
3022 /// finder's needle can be either borrowed or owned, so the lifetime of the
3023 /// needle returned must necessarily be the shorter of the two.
3024 #[inline]
3025 pub fn needle(&self) -> &[u8] {
3026 self.searcher.needle()
3027 }
3028
3029 /// Returns the index of the first occurrence of this needle in the given
3030 /// haystack.
3031 ///
3032 /// The haystack may be any type that can be cheaply converted into a
3033 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
3034 ///
3035 /// # Complexity
3036 ///
3037 /// This routine is guaranteed to have worst case linear time complexity
3038 /// with respect to both the needle and the haystack. That is, this runs
3039 /// in `O(needle.len() + haystack.len())` time.
3040 ///
3041 /// This routine is also guaranteed to have worst case constant space
3042 /// complexity.
3043 ///
3044 /// # Examples
3045 ///
3046 /// Basic usage:
3047 ///
3048 /// ```
3049 /// use bstr::Finder;
3050 ///
3051 /// let haystack = "foo bar baz";
3052 /// assert_eq!(Some(0), Finder::new("foo").find(haystack));
3053 /// assert_eq!(Some(4), Finder::new("bar").find(haystack));
3054 /// assert_eq!(None, Finder::new("quux").find(haystack));
3055 /// ```
3056 #[inline]
3057 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
3058 self.searcher.find(haystack.as_ref())
3059 }
3060 }
3061
3062 /// A single substring reverse searcher fixed to a particular needle.
3063 ///
3064 /// The purpose of this type is to permit callers to construct a substring
3065 /// searcher that can be used to search haystacks without the overhead of
3066 /// constructing the searcher in the first place. This is a somewhat niche
3067 /// concern when it's necessary to re-use the same needle to search multiple
3068 /// different haystacks with as little overhead as possible. In general, using
3069 /// [`ByteSlice::rfind`](trait.ByteSlice.html#method.rfind)
3070 /// or
3071 /// [`ByteSlice::rfind_iter`](trait.ByteSlice.html#method.rfind_iter)
3072 /// is good enough, but `FinderReverse` is useful when you can meaningfully
3073 /// observe searcher construction time in a profile.
3074 ///
3075 /// When the `std` feature is enabled, then this type has an `into_owned`
3076 /// version which permits building a `FinderReverse` that is not connected to
3077 /// the lifetime of its needle.
3078 #[derive(Clone, Debug)]
3079 pub struct FinderReverse<'a> {
3080 searcher: TwoWay<'a>,
3081 }
3082
3083 impl<'a> FinderReverse<'a> {
3084 /// Create a new reverse finder for the given needle.
3085 #[inline]
3086 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a> {
3087 FinderReverse { searcher: TwoWay::reverse(needle.as_ref()) }
3088 }
3089
3090 /// Convert this finder into its owned variant, such that it no longer
3091 /// borrows the needle.
3092 ///
3093 /// If this is already an owned finder, then this is a no-op. Otherwise,
3094 /// this copies the needle.
3095 ///
3096 /// This is only available when the `std` feature is enabled.
3097 #[cfg(feature = "std")]
3098 #[inline]
3099 pub fn into_owned(self) -> FinderReverse<'static> {
3100 FinderReverse { searcher: self.searcher.into_owned() }
3101 }
3102
3103 /// Returns the needle that this finder searches for.
3104 ///
3105 /// Note that the lifetime of the needle returned is tied to the lifetime
3106 /// of this finder, and may be shorter than the `'a` lifetime. Namely,
3107 /// a finder's needle can be either borrowed or owned, so the lifetime of
3108 /// the needle returned must necessarily be the shorter of the two.
3109 #[inline]
3110 pub fn needle(&self) -> &[u8] {
3111 self.searcher.needle()
3112 }
3113
3114 /// Returns the index of the last occurrence of this needle in the given
3115 /// haystack.
3116 ///
3117 /// The haystack may be any type that can be cheaply converted into a
3118 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
3119 ///
3120 /// # Complexity
3121 ///
3122 /// This routine is guaranteed to have worst case linear time complexity
3123 /// with respect to both the needle and the haystack. That is, this runs
3124 /// in `O(needle.len() + haystack.len())` time.
3125 ///
3126 /// This routine is also guaranteed to have worst case constant space
3127 /// complexity.
3128 ///
3129 /// # Examples
3130 ///
3131 /// Basic usage:
3132 ///
3133 /// ```
3134 /// use bstr::FinderReverse;
3135 ///
3136 /// let haystack = "foo bar baz";
3137 /// assert_eq!(Some(0), FinderReverse::new("foo").rfind(haystack));
3138 /// assert_eq!(Some(4), FinderReverse::new("bar").rfind(haystack));
3139 /// assert_eq!(None, FinderReverse::new("quux").rfind(haystack));
3140 /// ```
3141 #[inline]
3142 pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
3143 self.searcher.rfind(haystack.as_ref())
3144 }
3145 }
3146
3147 /// An iterator over non-overlapping substring matches.
3148 ///
3149 /// Matches are reported by the byte offset at which they begin.
3150 ///
3151 /// `'a` is the shorter of two lifetimes: the byte string being searched or the
3152 /// byte string being looked for.
3153 #[derive(Debug)]
3154 pub struct Find<'a> {
3155 haystack: &'a [u8],
3156 prestate: PrefilterState,
3157 searcher: TwoWay<'a>,
3158 pos: usize,
3159 }
3160
3161 impl<'a> Find<'a> {
3162 fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> {
3163 let searcher = TwoWay::forward(needle);
3164 let prestate = searcher.prefilter_state();
3165 Find { haystack, prestate, searcher, pos: 0 }
3166 }
3167 }
3168
3169 impl<'a> Iterator for Find<'a> {
3170 type Item = usize;
3171
3172 #[inline]
3173 fn next(&mut self) -> Option<usize> {
3174 if self.pos > self.haystack.len() {
3175 return None;
3176 }
3177 let result = self
3178 .searcher
3179 .find_with(&mut self.prestate, &self.haystack[self.pos..]);
3180 match result {
3181 None => None,
3182 Some(i) => {
3183 let pos = self.pos + i;
3184 self.pos = pos + cmp::max(1, self.searcher.needle().len());
3185 Some(pos)
3186 }
3187 }
3188 }
3189 }
3190
3191 /// An iterator over non-overlapping substring matches in reverse.
3192 ///
3193 /// Matches are reported by the byte offset at which they begin.
3194 ///
3195 /// `'a` is the shorter of two lifetimes: the byte string being searched or the
3196 /// byte string being looked for.
3197 #[derive(Debug)]
3198 pub struct FindReverse<'a> {
3199 haystack: &'a [u8],
3200 prestate: PrefilterState,
3201 searcher: TwoWay<'a>,
3202 /// When searching with an empty needle, this gets set to `None` after
3203 /// we've yielded the last element at `0`.
3204 pos: Option<usize>,
3205 }
3206
3207 impl<'a> FindReverse<'a> {
3208 fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> {
3209 let searcher = TwoWay::reverse(needle);
3210 let prestate = searcher.prefilter_state();
3211 let pos = Some(haystack.len());
3212 FindReverse { haystack, prestate, searcher, pos }
3213 }
3214
3215 fn haystack(&self) -> &'a [u8] {
3216 self.haystack
3217 }
3218
3219 fn needle(&self) -> &[u8] {
3220 self.searcher.needle()
3221 }
3222 }
3223
3224 impl<'a> Iterator for FindReverse<'a> {
3225 type Item = usize;
3226
3227 #[inline]
3228 fn next(&mut self) -> Option<usize> {
3229 let pos = match self.pos {
3230 None => return None,
3231 Some(pos) => pos,
3232 };
3233 let result = self
3234 .searcher
3235 .rfind_with(&mut self.prestate, &self.haystack[..pos]);
3236 match result {
3237 None => None,
3238 Some(i) => {
3239 if pos == i {
3240 self.pos = pos.checked_sub(1);
3241 } else {
3242 self.pos = Some(i);
3243 }
3244 Some(i)
3245 }
3246 }
3247 }
3248 }
3249
/// An iterator over the bytes in a byte string.
///
/// Bytes are yielded by value.
///
/// `'a` is the lifetime of the byte string being traversed.
#[derive(Clone, Debug)]
pub struct Bytes<'a> {
    /// The underlying slice iterator; every method delegates to it.
    it: slice::Iter<'a, u8>,
}

impl<'a> Iterator for Bytes<'a> {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<u8> {
        self.it.next().cloned()
    }
}

impl<'a> DoubleEndedIterator for Bytes<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<u8> {
        self.it.next_back().cloned()
    }
}

impl<'a> ExactSizeIterator for Bytes<'a> {
    #[inline]
    fn len(&self) -> usize {
        self.it.len()
    }
}
3280
3281 /// An iterator over the fields in a byte string, separated by whitespace.
3282 ///
3283 /// This iterator splits on contiguous runs of whitespace, such that the fields
3284 /// in `foo\t\t\n \nbar` are `foo` and `bar`.
3285 ///
3286 /// `'a` is the lifetime of the byte string being split.
3287 #[derive(Debug)]
3288 pub struct Fields<'a> {
3289 it: FieldsWith<'a, fn(char) -> bool>,
3290 }
3291
3292 impl<'a> Fields<'a> {
3293 fn new(bytes: &'a [u8]) -> Fields<'a> {
3294 Fields { it: bytes.fields_with(|ch| ch.is_whitespace()) }
3295 }
3296 }
3297
3298 impl<'a> Iterator for Fields<'a> {
3299 type Item = &'a [u8];
3300
3301 #[inline]
3302 fn next(&mut self) -> Option<&'a [u8]> {
3303 self.it.next()
3304 }
3305 }
3306
3307 /// An iterator over fields in the byte string, separated by a predicate over
3308 /// codepoints.
3309 ///
3310 /// This iterator splits a byte string based on its predicate function such
3311 /// that the elements returned are separated by contiguous runs of codepoints
3312 /// for which the predicate returns true.
3313 ///
3314 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3315 /// of the predicate, i.e., `FnMut(char) -> bool`.
3316 #[derive(Debug)]
3317 pub struct FieldsWith<'a, F> {
3318 f: F,
3319 bytes: &'a [u8],
3320 chars: CharIndices<'a>,
3321 }
3322
3323 impl<'a, F: FnMut(char) -> bool> FieldsWith<'a, F> {
3324 fn new(bytes: &'a [u8], f: F) -> FieldsWith<'a, F> {
3325 FieldsWith { f, bytes, chars: bytes.char_indices() }
3326 }
3327 }
3328
3329 impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> {
3330 type Item = &'a [u8];
3331
3332 #[inline]
3333 fn next(&mut self) -> Option<&'a [u8]> {
3334 let (start, mut end);
3335 loop {
3336 match self.chars.next() {
3337 None => return None,
3338 Some((s, e, ch)) => {
3339 if !(self.f)(ch) {
3340 start = s;
3341 end = e;
3342 break;
3343 }
3344 }
3345 }
3346 }
3347 while let Some((_, e, ch)) = self.chars.next() {
3348 if (self.f)(ch) {
3349 break;
3350 }
3351 end = e;
3352 }
3353 Some(&self.bytes[start..end])
3354 }
3355 }
3356
3357 /// An iterator over substrings in a byte string, split by a separator.
3358 ///
3359 /// `'a` is the lifetime of the byte string being split.
3360 #[derive(Debug)]
3361 pub struct Split<'a> {
3362 finder: Find<'a>,
3363 /// The end position of the previous match of our splitter. The element
3364 /// we yield corresponds to the substring starting at `last` up to the
3365 /// beginning of the next match of the splitter.
3366 last: usize,
3367 /// Only set when iteration is complete. A corner case here is when a
3368 /// splitter is matched at the end of the haystack. At that point, we still
3369 /// need to yield an empty string following it.
3370 done: bool,
3371 }
3372
3373 impl<'a> Split<'a> {
3374 fn new(haystack: &'a [u8], splitter: &'a [u8]) -> Split<'a> {
3375 let finder = haystack.find_iter(splitter);
3376 Split { finder, last: 0, done: false }
3377 }
3378 }
3379
3380 impl<'a> Iterator for Split<'a> {
3381 type Item = &'a [u8];
3382
3383 #[inline]
3384 fn next(&mut self) -> Option<&'a [u8]> {
3385 let haystack = self.finder.haystack;
3386 match self.finder.next() {
3387 Some(start) => {
3388 let next = &haystack[self.last..start];
3389 self.last = start + self.finder.searcher.needle().len();
3390 Some(next)
3391 }
3392 None => {
3393 if self.last >= haystack.len() {
3394 if !self.done {
3395 self.done = true;
3396 Some(b"")
3397 } else {
3398 None
3399 }
3400 } else {
3401 let s = &haystack[self.last..];
3402 self.last = haystack.len();
3403 self.done = true;
3404 Some(s)
3405 }
3406 }
3407 }
3408 }
3409 }
3410
3411 /// An iterator over substrings in a byte string, split by a separator, in
3412 /// reverse.
3413 ///
3414 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3415 /// of the predicate, i.e., `FnMut(char) -> bool`.
3416 #[derive(Debug)]
3417 pub struct SplitReverse<'a> {
3418 finder: FindReverse<'a>,
3419 /// The end position of the previous match of our splitter. The element
3420 /// we yield corresponds to the substring starting at `last` up to the
3421 /// beginning of the next match of the splitter.
3422 last: usize,
3423 /// Only set when iteration is complete. A corner case here is when a
3424 /// splitter is matched at the end of the haystack. At that point, we still
3425 /// need to yield an empty string following it.
3426 done: bool,
3427 }
3428
3429 impl<'a> SplitReverse<'a> {
3430 fn new(haystack: &'a [u8], splitter: &'a [u8]) -> SplitReverse<'a> {
3431 let finder = haystack.rfind_iter(splitter);
3432 SplitReverse { finder, last: haystack.len(), done: false }
3433 }
3434 }
3435
3436 impl<'a> Iterator for SplitReverse<'a> {
3437 type Item = &'a [u8];
3438
3439 #[inline]
3440 fn next(&mut self) -> Option<&'a [u8]> {
3441 let haystack = self.finder.haystack();
3442 match self.finder.next() {
3443 Some(start) => {
3444 let nlen = self.finder.needle().len();
3445 let next = &haystack[start + nlen..self.last];
3446 self.last = start;
3447 Some(next)
3448 }
3449 None => {
3450 if self.last == 0 {
3451 if !self.done {
3452 self.done = true;
3453 Some(b"")
3454 } else {
3455 None
3456 }
3457 } else {
3458 let s = &haystack[..self.last];
3459 self.last = 0;
3460 self.done = true;
3461 Some(s)
3462 }
3463 }
3464 }
3465 }
3466 }
3467
3468 /// An iterator over at most `n` substrings in a byte string, split by a
3469 /// separator.
3470 ///
3471 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3472 /// of the predicate, i.e., `FnMut(char) -> bool`.
3473 #[derive(Debug)]
3474 pub struct SplitN<'a> {
3475 split: Split<'a>,
3476 limit: usize,
3477 count: usize,
3478 }
3479
3480 impl<'a> SplitN<'a> {
3481 fn new(
3482 haystack: &'a [u8],
3483 splitter: &'a [u8],
3484 limit: usize,
3485 ) -> SplitN<'a> {
3486 let split = haystack.split_str(splitter);
3487 SplitN { split, limit, count: 0 }
3488 }
3489 }
3490
3491 impl<'a> Iterator for SplitN<'a> {
3492 type Item = &'a [u8];
3493
3494 #[inline]
3495 fn next(&mut self) -> Option<&'a [u8]> {
3496 self.count += 1;
3497 if self.count > self.limit || self.split.done {
3498 None
3499 } else if self.count == self.limit {
3500 Some(&self.split.finder.haystack[self.split.last..])
3501 } else {
3502 self.split.next()
3503 }
3504 }
3505 }
3506
3507 /// An iterator over at most `n` substrings in a byte string, split by a
3508 /// separator, in reverse.
3509 ///
3510 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3511 /// of the predicate, i.e., `FnMut(char) -> bool`.
3512 #[derive(Debug)]
3513 pub struct SplitNReverse<'a> {
3514 split: SplitReverse<'a>,
3515 limit: usize,
3516 count: usize,
3517 }
3518
3519 impl<'a> SplitNReverse<'a> {
3520 fn new(
3521 haystack: &'a [u8],
3522 splitter: &'a [u8],
3523 limit: usize,
3524 ) -> SplitNReverse<'a> {
3525 let split = haystack.rsplit_str(splitter);
3526 SplitNReverse { split, limit, count: 0 }
3527 }
3528 }
3529
3530 impl<'a> Iterator for SplitNReverse<'a> {
3531 type Item = &'a [u8];
3532
3533 #[inline]
3534 fn next(&mut self) -> Option<&'a [u8]> {
3535 self.count += 1;
3536 if self.count > self.limit || self.split.done {
3537 None
3538 } else if self.count == self.limit {
3539 Some(&self.split.finder.haystack()[..self.split.last])
3540 } else {
3541 self.split.next()
3542 }
3543 }
3544 }
3545
3546 /// An iterator over all lines in a byte string, without their terminators.
3547 ///
3548 /// For this iterator, the only line terminators recognized are `\r\n` and
3549 /// `\n`.
3550 ///
3551 /// `'a` is the lifetime of the byte string being iterated over.
3552 pub struct Lines<'a> {
3553 it: LinesWithTerminator<'a>,
3554 }
3555
3556 impl<'a> Lines<'a> {
3557 fn new(bytes: &'a [u8]) -> Lines<'a> {
3558 Lines { it: LinesWithTerminator::new(bytes) }
3559 }
3560 }
3561
3562 impl<'a> Iterator for Lines<'a> {
3563 type Item = &'a [u8];
3564
3565 #[inline]
3566 fn next(&mut self) -> Option<&'a [u8]> {
3567 let mut line = self.it.next()?;
3568 if line.last_byte() == Some(b'\n') {
3569 line = &line[..line.len() - 1];
3570 if line.last_byte() == Some(b'\r') {
3571 line = &line[..line.len() - 1];
3572 }
3573 }
3574 Some(line)
3575 }
3576 }
3577
3578 /// An iterator over all lines in a byte string, including their terminators.
3579 ///
3580 /// For this iterator, the only line terminator recognized is `\n`. (Since
3581 /// line terminators are included, this also handles `\r\n` line endings.)
3582 ///
3583 /// Line terminators are only included if they are present in the original
3584 /// byte string. For example, the last line in a byte string may not end with
3585 /// a line terminator.
3586 ///
3587 /// Concatenating all elements yielded by this iterator is guaranteed to yield
3588 /// the original byte string.
3589 ///
3590 /// `'a` is the lifetime of the byte string being iterated over.
3591 pub struct LinesWithTerminator<'a> {
3592 bytes: &'a [u8],
3593 }
3594
3595 impl<'a> LinesWithTerminator<'a> {
3596 fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> {
3597 LinesWithTerminator { bytes }
3598 }
3599 }
3600
3601 impl<'a> Iterator for LinesWithTerminator<'a> {
3602 type Item = &'a [u8];
3603
3604 #[inline]
3605 fn next(&mut self) -> Option<&'a [u8]> {
3606 match self.bytes.find_byte(b'\n') {
3607 None if self.bytes.is_empty() => None,
3608 None => {
3609 let line = self.bytes;
3610 self.bytes = b"";
3611 Some(line)
3612 }
3613 Some(end) => {
3614 let line = &self.bytes[..end + 1];
3615 self.bytes = &self.bytes[end + 1..];
3616 Some(line)
3617 }
3618 }
3619 }
3620 }
3621
#[cfg(test)]
mod tests {
    use ext_slice::{ByteSlice, B};
    use tests::LOSSY_TESTS;

    /// Checks both lossy conversion routines against every shared test case,
    /// and checks that the standard library produces the same output.
    #[test]
    fn to_str_lossy() {
        for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
            let want = expected.as_bytes();

            let cow = B(input).to_str_lossy();
            assert_eq!(
                want,
                cow.as_bytes(),
                "to_str_lossy(ith: {:?}, given: {:?})",
                i,
                input,
            );

            let mut buf = String::new();
            B(input).to_str_lossy_into(&mut buf);
            assert_eq!(want, buf.as_bytes(), "to_str_lossy_into");

            let std_got = String::from_utf8_lossy(input);
            assert_eq!(want, std_got.as_bytes(), "std");
        }
    }

    /// Two source bytes do not fit at `dest = 5` in a six byte buffer.
    #[test]
    #[should_panic]
    fn copy_within_fail1() {
        let mut buf = *b"foobar";
        buf.copy_within_str(0..2, 5);
    }

    /// The source range ends before it starts.
    #[test]
    #[should_panic]
    fn copy_within_fail2() {
        let mut buf = *b"foobar";
        buf.copy_within_str(3..2, 0);
    }

    /// The source range extends past the end of the buffer.
    #[test]
    #[should_panic]
    fn copy_within_fail3() {
        let mut buf = *b"foobar";
        buf.copy_within_str(5..7, 0);
    }

    /// One source byte does not fit at `dest = 6` in a six byte buffer.
    #[test]
    #[should_panic]
    fn copy_within_fail4() {
        let mut buf = *b"foobar";
        buf.copy_within_str(0..1, 6);
    }
}