src/vendor/regex-0.2.10/src/re_bytes.rs

   1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use std::borrow::Cow;
  12 use std::collections::HashMap;
  13 use std::fmt;
  14 use std::ops::Index;
  15 use std::str::FromStr;
  16 use std::sync::Arc;
  17
  18 use memchr::memchr;
  19
  20 use exec::{Exec, ExecNoSync};
  21 use expand::expand_bytes;
  22 use error::Error;
  23 use re_builder::bytes::RegexBuilder;
  24 use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter};
  25
  26 /// Match represents a single match of a regex in a haystack.
  27 ///
  28 /// The lifetime parameter `'t` refers to the lifetime of the matched text.
  29 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
  30 pub struct Match<'t> {
  31     text: &'t [u8],
  32     start: usize,
  33     end: usize,
  34 }
  35
  36 impl<'t> Match<'t> {
  37     /// Returns the starting byte offset of the match in the haystack.
  38     #[inline]
  39     pub fn start(&self) -> usize {
  40         self.start
  41     }
  42
  43     /// Returns the ending byte offset of the match in the haystack.
  44     #[inline]
  45     pub fn end(&self) -> usize {
  46         self.end
  47     }
  48
  49     /// Returns the matched text.
  50     #[inline]
  51     pub fn as_bytes(&self) -> &'t [u8] {
  52         &self.text[self.start..self.end]
  53     }
  54
  55     /// Creates a new match from the given haystack and byte offsets.
  56     #[inline]
  57     fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> {
  58         Match {
  59             text: haystack,
  60             start: start,
  61             end: end,
  62         }
  63     }
  64 }
  65
/// A compiled regular expression for matching arbitrary bytes.
///
/// It can be used to search, split or replace text. All searching is done with
/// an implicit `.*?` at the beginning and end of an expression. To force an
/// expression to match the whole string (or a prefix or a suffix), you must
/// use an anchor like `^` or `$` (or `\A` and `\z`).
///
/// Like the `Regex` type in the parent module, matches with this regex return
/// byte offsets into the search text. **Unlike** the parent `Regex` type,
/// these byte offsets may not correspond to UTF-8 sequence boundaries since
/// the regexes in this module can match arbitrary bytes.
///
/// The type is a newtype around the crate-internal `Exec` value, and it is
/// `Clone` through the derive below.
#[derive(Clone)]
pub struct Regex(Exec);
  79
  80 impl fmt::Display for Regex {
  81     /// Shows the original regular expression.
  82     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  83         write!(f, "{}", self.as_str())
  84     }
  85 }
  86
  87 impl fmt::Debug for Regex {
  88     /// Shows the original regular expression.
  89     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
  90         fmt::Display::fmt(self, f)
  91     }
  92 }
  93
  94 /// A constructor for Regex from an Exec.
  95 ///
  96 /// This is hidden because Exec isn't actually part of the public API.
  97 #[doc(hidden)]
  98 impl From<Exec> for Regex {
  99     fn from(exec: Exec) -> Regex {
 100         Regex(exec)
 101     }
 102 }
 103
 104 impl FromStr for Regex {
 105     type Err = Error;
 106
 107     /// Attempts to parse a string into a regular expression
 108     fn from_str(s: &str) -> Result<Regex, Error> {
 109         Regex::new(s)
 110     }
 111 }
 112
 113 /// Core regular expression methods.
 114 impl Regex {
 115     /// Compiles a regular expression. Once compiled, it can be used repeatedly
 116     /// to search, split or replace text in a string.
 117     ///
 118     /// If an invalid expression is given, then an error is returned.
 119     pub fn new(re: &str) -> Result<Regex, Error> {
 120         RegexBuilder::new(re).build()
 121     }
 122
 123     /// Returns true if and only if the regex matches the string given.
 124     ///
 125     /// It is recommended to use this method if all you need to do is test
 126     /// a match, since the underlying matching engine may be able to do less
 127     /// work.
 128     ///
 129     /// # Example
 130     ///
 131     /// Test if some text contains at least one word with exactly 13 ASCII word
 132     /// bytes:
 133     ///
 134     /// ```rust
 135     /// # extern crate regex; use regex::bytes::Regex;
 136     /// # fn main() {
 137     /// let text = b"I categorically deny having triskaidekaphobia.";
 138     /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
 139     /// # }
 140     /// ```
 141     pub fn is_match(&self, text: &[u8]) -> bool {
 142         self.is_match_at(text, 0)
 143     }
 144
 145     /// Returns the start and end byte range of the leftmost-first match in
 146     /// `text`. If no match exists, then `None` is returned.
 147     ///
 148     /// Note that this should only be used if you want to discover the position
 149     /// of the match. Testing the existence of a match is faster if you use
 150     /// `is_match`.
 151     ///
 152     /// # Example
 153     ///
 154     /// Find the start and end location of the first word with exactly 13
 155     /// ASCII word bytes:
 156     ///
 157     /// ```rust
 158     /// # extern crate regex; use regex::bytes::Regex;
 159     /// # fn main() {
 160     /// let text = b"I categorically deny having triskaidekaphobia.";
 161     /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
 162     /// assert_eq!((mat.start(), mat.end()), (2, 15));
 163     /// # }
 164     /// ```
 165     pub fn find<'t>(&self, text: &'t [u8]) -> Option<Match<'t>> {
 166         self.find_at(text, 0)
 167     }
 168
 169     /// Returns an iterator for each successive non-overlapping match in
 170     /// `text`, returning the start and end byte indices with respect to
 171     /// `text`.
 172     ///
 173     /// # Example
 174     ///
 175     /// Find the start and end location of every word with exactly 13 ASCII
 176     /// word bytes:
 177     ///
 178     /// ```rust
 179     /// # extern crate regex; use regex::bytes::Regex;
 180     /// # fn main() {
 181     /// let text = b"Retroactively relinquishing remunerations is reprehensible.";
 182     /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
 183     ///     println!("{:?}", mat);
 184     /// }
 185     /// # }
 186     /// ```
 187     pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> {
 188         Matches(self.0.searcher().find_iter(text))
 189     }
 190
 191     /// Returns the capture groups corresponding to the leftmost-first
 192     /// match in `text`. Capture group `0` always corresponds to the entire
 193     /// match. If no match is found, then `None` is returned.
 194     ///
 195     /// You should only use `captures` if you need access to the location of
 196     /// capturing group matches. Otherwise, `find` is faster for discovering
 197     /// the location of the overall match.
 198     ///
 199     /// # Examples
 200     ///
 201     /// Say you have some text with movie names and their release years,
 202     /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
 203     /// looking like that, while also extracting the movie name and its release
 204     /// year separately.
 205     ///
 206     /// ```rust
 207     /// # extern crate regex; use regex::bytes::Regex;
 208     /// # fn main() {
 209     /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
 210     /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
 211     /// let caps = re.captures(text).unwrap();
 212     /// assert_eq!(&caps[1], &b"Citizen Kane"[..]);
 213     /// assert_eq!(&caps[2], &b"1941"[..]);
 214     /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]);
 215     /// // You can also access the groups by index using the Index notation.
 216     /// // Note that this will panic on an invalid index.
 217     /// assert_eq!(&caps[1], b"Citizen Kane");
 218     /// assert_eq!(&caps[2], b"1941");
 219     /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
 220     /// # }
 221     /// ```
 222     ///
 223     /// Note that the full match is at capture group `0`. Each subsequent
 224     /// capture group is indexed by the order of its opening `(`.
 225     ///
 226     /// We can make this example a bit clearer by using *named* capture groups:
 227     ///
 228     /// ```rust
 229     /// # extern crate regex; use regex::bytes::Regex;
 230     /// # fn main() {
 231     /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
 232     ///                .unwrap();
 233     /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
 234     /// let caps = re.captures(text).unwrap();
 235     /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]);
 236     /// assert_eq!(&caps["year"], &b"1941"[..]);
 237     /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]);
 238     /// // You can also access the groups by name using the Index notation.
 239     /// // Note that this will panic on an invalid group name.
 240     /// assert_eq!(&caps["title"], b"Citizen Kane");
 241     /// assert_eq!(&caps["year"], b"1941");
 242     /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
 243     ///
 244     /// # }
 245     /// ```
 246     ///
 247     /// Here we name the capture groups, which we can access with the `name`
 248     /// method or the `Index` notation with a `&str`. Note that the named
 249     /// capture groups are still accessible with `get` or the `Index` notation
 250     /// with a `usize`.
 251     ///
 252     /// The `0`th capture group is always unnamed, so it must always be
 253     /// accessed with `get(0)` or `[0]`.
 254     pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
 255         let mut locs = self.locations();
 256         self.read_captures_at(&mut locs, text, 0).map(|_| Captures {
 257             text: text,
 258             locs: locs,
 259             named_groups: self.0.capture_name_idx().clone(),
 260         })
 261     }
 262
 263     /// Returns an iterator over all the non-overlapping capture groups matched
 264     /// in `text`. This is operationally the same as `find_iter`, except it
 265     /// yields information about capturing group matches.
 266     ///
 267     /// # Example
 268     ///
 269     /// We can use this to find all movie titles and their release years in
 270     /// some text, where the movie is formatted like "'Title' (xxxx)":
 271     ///
 272     /// ```rust
 273     /// # extern crate regex; use std::str; use regex::bytes::Regex;
 274     /// # fn main() {
 275     /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
 276     ///                .unwrap();
 277     /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
 278     /// for caps in re.captures_iter(text) {
 279     ///     let title = str::from_utf8(&caps["title"]).unwrap();
 280     ///     let year = str::from_utf8(&caps["year"]).unwrap();
 281     ///     println!("Movie: {:?}, Released: {:?}", title, year);
 282     /// }
 283     /// // Output:
 284     /// // Movie: Citizen Kane, Released: 1941
 285     /// // Movie: The Wizard of Oz, Released: 1939
 286     /// // Movie: M, Released: 1931
 287     /// # }
 288     /// ```
 289     pub fn captures_iter<'r, 't>(
 290         &'r self,
 291         text: &'t [u8],
 292     ) -> CaptureMatches<'r, 't> {
 293         CaptureMatches(self.0.searcher().captures_iter(text))
 294     }
 295
 296     /// Returns an iterator of substrings of `text` delimited by a match of the
 297     /// regular expression. Namely, each element of the iterator corresponds to
 298     /// text that *isn't* matched by the regular expression.
 299     ///
 300     /// This method will *not* copy the text given.
 301     ///
 302     /// # Example
 303     ///
 304     /// To split a string delimited by arbitrary amounts of spaces or tabs:
 305     ///
 306     /// ```rust
 307     /// # extern crate regex; use regex::bytes::Regex;
 308     /// # fn main() {
 309     /// let re = Regex::new(r"[ \t]+").unwrap();
 310     /// let fields: Vec<&[u8]> = re.split(b"a b \t  c\td    e").collect();
 311     /// assert_eq!(fields, vec![
 312     ///     &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..],
 313     /// ]);
 314     /// # }
 315     /// ```
 316     pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> {
 317         Split {
 318             finder: self.find_iter(text),
 319             last: 0,
 320         }
 321     }
 322
 323     /// Returns an iterator of at most `limit` substrings of `text` delimited
 324     /// by a match of the regular expression. (A `limit` of `0` will return no
 325     /// substrings.) Namely, each element of the iterator corresponds to text
 326     /// that *isn't* matched by the regular expression. The remainder of the
 327     /// string that is not split will be the last element in the iterator.
 328     ///
 329     /// This method will *not* copy the text given.
 330     ///
 331     /// # Example
 332     ///
 333     /// Get the first two words in some text:
 334     ///
 335     /// ```rust
 336     /// # extern crate regex; use regex::bytes::Regex;
 337     /// # fn main() {
 338     /// let re = Regex::new(r"\W+").unwrap();
 339     /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect();
 340     /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]);
 341     /// # }
 342     /// ```
 343     pub fn splitn<'r, 't>(
 344         &'r self,
 345         text: &'t [u8],
 346         limit: usize,
 347     ) -> SplitN<'r, 't> {
 348         SplitN {
 349             splits: self.split(text),
 350             n: limit,
 351         }
 352     }
 353
 354     /// Replaces the leftmost-first match with the replacement provided. The
 355     /// replacement can be a regular byte string (where `$N` and `$name` are
 356     /// expanded to match capture groups) or a function that takes the matches'
 357     /// `Captures` and returns the replaced byte string.
 358     ///
 359     /// If no match is found, then a copy of the byte string is returned
 360     /// unchanged.
 361     ///
 362     /// # Replacement string syntax
 363     ///
 364     /// All instances of `$name` in the replacement text is replaced with the
 365     /// corresponding capture group `name`.
 366     ///
 367     /// `name` may be an integer corresponding to the index of the
 368     /// capture group (counted by order of opening parenthesis where `0` is the
 369     /// entire match) or it can be a name (consisting of letters, digits or
 370     /// underscores) corresponding to a named capture group.
 371     ///
 372     /// If `name` isn't a valid capture group (whether the name doesn't exist
 373     /// or isn't a valid index), then it is replaced with the empty string.
 374     ///
 375     /// The longest possible name is used. e.g., `$1a` looks up the capture
 376     /// group named `1a` and not the capture group at index `1`. To exert more
 377     /// precise control over the name, use braces, e.g., `${1}a`.
 378     ///
 379     /// To write a literal `$` use `$$`.
 380     ///
 381     /// # Examples
 382     ///
 383     /// Note that this function is polymorphic with respect to the replacement.
 384     /// In typical usage, this can just be a normal byte string:
 385     ///
 386     /// ```rust
 387     /// # extern crate regex; use regex::bytes::Regex;
 388     /// # fn main() {
 389     /// let re = Regex::new("[^01]+").unwrap();
 390     /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]);
 391     /// # }
 392     /// ```
 393     ///
 394     /// But anything satisfying the `Replacer` trait will work. For example, a
 395     /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the
 396     /// captures corresponding to a match. This allows one to access capturing
 397     /// group matches easily:
 398     ///
 399     /// ```rust
 400     /// # extern crate regex; use regex::bytes::Regex;
 401     /// # use regex::bytes::Captures; fn main() {
 402     /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
 403     /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| {
 404     ///     let mut replacement = caps[2].to_owned();
 405     ///     replacement.push(b' ');
 406     ///     replacement.extend(&caps[1]);
 407     ///     replacement
 408     /// });
 409     /// assert_eq!(result, &b"Bruce Springsteen"[..]);
 410     /// # }
 411     /// ```
 412     ///
 413     /// But this is a bit cumbersome to use all the time. Instead, a simple
 414     /// syntax is supported that expands `$name` into the corresponding capture
 415     /// group. Here's the last example, but using this expansion technique
 416     /// with named capture groups:
 417     ///
 418     /// ```rust
 419     /// # extern crate regex; use regex::bytes::Regex;
 420     /// # fn main() {
 421     /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
 422     /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]);
 423     /// assert_eq!(result, &b"Bruce Springsteen"[..]);
 424     /// # }
 425     /// ```
 426     ///
 427     /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
 428     /// would produce the same result. To write a literal `$` use `$$`.
 429     ///
 430     /// Sometimes the replacement string requires use of curly braces to
 431     /// delineate a capture group replacement and surrounding literal text.
 432     /// For example, if we wanted to join two words together with an
 433     /// underscore:
 434     ///
 435     /// ```rust
 436     /// # extern crate regex; use regex::bytes::Regex;
 437     /// # fn main() {
 438     /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
 439     /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]);
 440     /// assert_eq!(result, &b"deep_fried"[..]);
 441     /// # }
 442     /// ```
 443     ///
 444     /// Without the curly braces, the capture group name `first_` would be
 445     /// used, and since it doesn't exist, it would be replaced with the empty
 446     /// string.
 447     ///
 448     /// Finally, sometimes you just want to replace a literal string with no
 449     /// regard for capturing group expansion. This can be done by wrapping a
 450     /// byte string with `NoExpand`:
 451     ///
 452     /// ```rust
 453     /// # extern crate regex; use regex::bytes::Regex;
 454     /// # fn main() {
 455     /// use regex::bytes::NoExpand;
 456     ///
 457     /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
 458     /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last"));
 459     /// assert_eq!(result, &b"$2 $last"[..]);
 460     /// # }
 461     /// ```
 462     pub fn replace<'t, R: Replacer>(
 463         &self,
 464         text: &'t [u8],
 465         rep: R,
 466     ) -> Cow<'t, [u8]> {
 467         self.replacen(text, 1, rep)
 468     }
 469
 470     /// Replaces all non-overlapping matches in `text` with the replacement
 471     /// provided. This is the same as calling `replacen` with `limit` set to
 472     /// `0`.
 473     ///
 474     /// See the documentation for `replace` for details on how to access
 475     /// capturing group matches in the replacement text.
 476     pub fn replace_all<'t, R: Replacer>(
 477         &self,
 478         text: &'t [u8],
 479         rep: R,
 480     ) -> Cow<'t, [u8]> {
 481         self.replacen(text, 0, rep)
 482     }
 483
 484     /// Replaces at most `limit` non-overlapping matches in `text` with the
 485     /// replacement provided. If `limit` is 0, then all non-overlapping matches
 486     /// are replaced.
 487     ///
 488     /// See the documentation for `replace` for details on how to access
 489     /// capturing group matches in the replacement text.
 490     pub fn replacen<'t, R: Replacer>(
 491         &self,
 492         text: &'t [u8],
 493         limit: usize,
 494         mut rep: R,
 495     ) -> Cow<'t, [u8]> {
 496         if let Some(rep) = rep.no_expansion() {
 497             let mut it = self.find_iter(text).enumerate().peekable();
 498             if it.peek().is_none() {
 499                 return Cow::Borrowed(text);
 500             }
 501             let mut new = Vec::with_capacity(text.len());
 502             let mut last_match = 0;
 503             for (i, m) in it {
 504                 if limit > 0 && i >= limit {
 505                     break
 506                 }
 507                 new.extend_from_slice(&text[last_match..m.start()]);
 508                 new.extend_from_slice(&rep);
 509                 last_match = m.end();
 510             }
 511             new.extend_from_slice(&text[last_match..]);
 512             return Cow::Owned(new);
 513         }
 514
 515         // The slower path, which we use if the replacement needs access to
 516         // capture groups.
 517         let mut it = self.captures_iter(text).enumerate().peekable();
 518         if it.peek().is_none() {
 519             return Cow::Borrowed(text);
 520         }
 521         let mut new = Vec::with_capacity(text.len());
 522         let mut last_match = 0;
 523         for (i, cap) in it {
 524             if limit > 0 && i >= limit {
 525                 break
 526             }
 527             // unwrap on 0 is OK because captures only reports matches
 528             let m = cap.get(0).unwrap();
 529             new.extend_from_slice(&text[last_match..m.start()]);
 530             rep.replace_append(&cap, &mut new);
 531             last_match = m.end();
 532         }
 533         new.extend_from_slice(&text[last_match..]);
 534         Cow::Owned(new)
 535     }
 536 }
 537
 538 /// Advanced or "lower level" search methods.
 539 impl Regex {
 540     /// Returns the end location of a match in the text given.
 541     ///
 542     /// This method may have the same performance characteristics as
 543     /// `is_match`, except it provides an end location for a match. In
 544     /// particular, the location returned *may be shorter* than the proper end
 545     /// of the leftmost-first match.
 546     ///
 547     /// # Example
 548     ///
 549     /// Typically, `a+` would match the entire first sequence of `a` in some
 550     /// text, but `shortest_match` can give up as soon as it sees the first
 551     /// `a`.
 552     ///
 553     /// ```rust
 554     /// # extern crate regex; use regex::bytes::Regex;
 555     /// # fn main() {
 556     /// let text = b"aaaaa";
 557     /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
 558     /// assert_eq!(pos, Some(1));
 559     /// # }
 560     /// ```
 561     pub fn shortest_match(&self, text: &[u8]) -> Option<usize> {
 562         self.shortest_match_at(text, 0)
 563     }
 564
 565     /// Returns the same as shortest_match, but starts the search at the given
 566     /// offset.
 567     ///
 568     /// The significance of the starting point is that it takes the surrounding
 569     /// context into consideration. For example, the `\A` anchor can only
 570     /// match when `start == 0`.
 571     #[doc(hidden)]
 572     pub fn shortest_match_at(
 573         &self,
 574         text: &[u8],
 575         start: usize,
 576     ) -> Option<usize> {
 577         self.0.searcher().shortest_match_at(text, start)
 578     }
 579
 580     /// Returns the same as is_match, but starts the search at the given
 581     /// offset.
 582     ///
 583     /// The significance of the starting point is that it takes the surrounding
 584     /// context into consideration. For example, the `\A` anchor can only
 585     /// match when `start == 0`.
 586     #[doc(hidden)]
 587     pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
 588         self.shortest_match_at(text, start).is_some()
 589     }
 590
 591     /// Returns the same as find, but starts the search at the given
 592     /// offset.
 593     ///
 594     /// The significance of the starting point is that it takes the surrounding
 595     /// context into consideration. For example, the `\A` anchor can only
 596     /// match when `start == 0`.
 597     #[doc(hidden)]
 598     pub fn find_at<'t>(
 599         &self,
 600         text: &'t [u8],
 601         start: usize,
 602     ) -> Option<Match<'t>> {
 603         self.0.searcher().find_at(text, start)
 604             .map(|(s, e)| Match::new(text, s, e))
 605     }
 606
 607     /// Returns the same as captures, but starts the search at the given
 608     /// offset and populates the capture locations given.
 609     ///
 610     /// The significance of the starting point is that it takes the surrounding
 611     /// context into consideration. For example, the `\A` anchor can only
 612     /// match when `start == 0`.
 613     #[doc(hidden)]
 614     pub fn read_captures_at<'t>(
 615         &self,
 616         locs: &mut Locations,
 617         text: &'t [u8],
 618         start: usize,
 619     ) -> Option<Match<'t>> {
 620         self.0.searcher().read_captures_at(locs, text, start)
 621             .map(|(s, e)| Match::new(text, s, e))
 622     }
 623 }
 624
 625 /// Auxiliary methods.
 626 impl Regex {
 627     /// Returns the original string of this regex.
 628     pub fn as_str(&self) -> &str {
 629         &self.0.regex_strings()[0]
 630     }
 631
 632     /// Returns an iterator over the capture names.
 633     pub fn capture_names(&self) -> CaptureNames {
 634         CaptureNames(self.0.capture_names().iter())
 635     }
 636
 637     /// Returns the number of captures.
 638     pub fn captures_len(&self) -> usize {
 639         self.0.capture_names().len()
 640     }
 641
 642     /// Returns an empty set of locations that can be reused in multiple calls
 643     /// to `read_captures`.
 644     #[doc(hidden)]
 645     pub fn locations(&self) -> Locations {
 646         self.0.searcher().locations()
 647     }
 648 }
 649
 650 /// An iterator over all non-overlapping matches for a particular string.
 651 ///
 652 /// The iterator yields a tuple of integers corresponding to the start and end
 653 /// of the match. The indices are byte offsets. The iterator stops when no more
 654 /// matches can be found.
 655 ///
 656 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 657 /// lifetime of the matched byte string.
 658 pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>);
 659
 660 impl<'r, 't> Iterator for Matches<'r, 't> {
 661     type Item = Match<'t>;
 662
 663     fn next(&mut self) -> Option<Match<'t>> {
 664         let text = self.0.text();
 665         self.0.next().map(|(s, e)| Match::new(text, s, e))
 666     }
 667 }
 668
 669 /// An iterator that yields all non-overlapping capture groups matching a
 670 /// particular regular expression.
 671 ///
 672 /// The iterator stops when no more matches can be found.
 673 ///
 674 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 675 /// lifetime of the matched byte string.
 676 pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSync<'r>>);
 677
 678 impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
 679     type Item = Captures<'t>;
 680
 681     fn next(&mut self) -> Option<Captures<'t>> {
 682         self.0.next().map(|locs| Captures {
 683             text: self.0.text(),
 684             locs: locs,
 685             named_groups: self.0.regex().capture_name_idx().clone(),
 686         })
 687     }
 688 }
 689
 690 /// Yields all substrings delimited by a regular expression match.
 691 ///
 692 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 693 /// lifetime of the byte string being split.
 694 pub struct Split<'r, 't> {
 695     finder: Matches<'r, 't>,
 696     last: usize,
 697 }
 698
 699 impl<'r, 't> Iterator for Split<'r, 't> {
 700     type Item = &'t [u8];
 701
 702     fn next(&mut self) -> Option<&'t [u8]> {
 703         let text = self.finder.0.text();
 704         match self.finder.next() {
 705             None => {
 706                 if self.last >= text.len() {
 707                     None
 708                 } else {
 709                     let s = &text[self.last..];
 710                     self.last = text.len();
 711                     Some(s)
 712                 }
 713             }
 714             Some(m) => {
 715                 let matched = &text[self.last..m.start()];
 716                 self.last = m.end();
 717                 Some(matched)
 718             }
 719         }
 720     }
 721 }
 722
 723 /// Yields at most `N` substrings delimited by a regular expression match.
 724 ///
 725 /// The last substring will be whatever remains after splitting.
 726 ///
 727 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 728 /// lifetime of the byte string being split.
 729 pub struct SplitN<'r, 't> {
 730     splits: Split<'r, 't>,
 731     n: usize,
 732 }
 733
 734 impl<'r, 't> Iterator for SplitN<'r, 't> {
 735     type Item = &'t [u8];
 736
 737     fn next(&mut self) -> Option<&'t [u8]> {
 738         if self.n == 0 {
 739             return None
 740         }
 741         self.n -= 1;
 742         if self.n == 0 {
 743             let text = self.splits.finder.0.text();
 744             Some(&text[self.splits.last..])
 745         } else {
 746             self.splits.next()
 747         }
 748     }
 749 }
 750
 751 /// An iterator over the names of all possible captures.
 752 ///
 753 /// `None` indicates an unnamed capture; the first element (capture 0, the
 754 /// whole matched region) is always unnamed.
 755 ///
 756 /// `'r` is the lifetime of the compiled regular expression.
 757 pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
 758
 759 impl<'r> Iterator for CaptureNames<'r> {
 760     type Item = Option<&'r str>;
 761
 762     fn next(&mut self) -> Option<Option<&'r str>> {
 763         self.0.next().as_ref()
 764             .map(|slot| slot.as_ref().map(|name| name.as_ref()))
 765     }
 766
 767     fn size_hint(&self) -> (usize, Option<usize>) {
 768         self.0.size_hint()
 769     }
 770 }
 771
/// Captures represents a group of captured byte strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
/// index corresponds to the next capture group in the regex. If a capture
/// group is named, then the matched byte string is *also* available via the
/// `name` method. (Note that the 0th capture is always unnamed and so must be
/// accessed with the `get` method.)
///
/// Positions returned from a capture group are always byte indices.
///
/// `'t` is the lifetime of the matched text.
pub struct Captures<'t> {
    // The text that was searched; group offsets index into it.
    text: &'t [u8],
    // Positions of the capture groups for this match.
    locs: Locations,
    // Maps a group name to its group index; consulted by `name`.
    named_groups: Arc<HashMap<String, usize>>,
}
 788
 789 impl<'t> Captures<'t> {
 790     /// Returns the match associated with the capture group at index `i`. If
 791     /// `i` does not correspond to a capture group, or if the capture group
 792     /// did not participate in the match, then `None` is returned.
 793     ///
 794     /// # Examples
 795     ///
 796     /// Get the text of the match with a default of an empty string if this
 797     /// group didn't participate in the match:
 798     ///
 799     /// ```rust
 800     /// # use regex::bytes::Regex;
 801     /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
 802     /// let caps = re.captures(b"abc123").unwrap();
 803     ///
 804     /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
 805     /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
 806     /// assert_eq!(text1, &b"123"[..]);
 807     /// assert_eq!(text2, &b""[..]);
 808     /// ```
 809     pub fn get(&self, i: usize) -> Option<Match<'t>> {
 810         self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e))
 811     }
 812
 813     /// Returns the match for the capture group named `name`. If `name` isn't a
 814     /// valid capture group or didn't match anything, then `None` is returned.
 815     pub fn name(&self, name: &str) -> Option<Match<'t>> {
 816         self.named_groups.get(name).and_then(|&i| self.get(i))
 817     }
 818
 819     /// An iterator that yields all capturing matches in the order in which
 820     /// they appear in the regex. If a particular capture group didn't
 821     /// participate in the match, then `None` is yielded for that capture.
 822     ///
 823     /// The first match always corresponds to the overall match of the regex.
 824     pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
 825         SubCaptureMatches {
 826             caps: self,
 827             it: self.locs.iter(),
 828         }
 829     }
 830
 831     /// Expands all instances of `$name` in `replacement` to the corresponding
 832     /// capture group `name`, and writes them to the `dst` buffer given.
 833     ///
 834     /// `name` may be an integer corresponding to the index of the
 835     /// capture group (counted by order of opening parenthesis where `0` is the
 836     /// entire match) or it can be a name (consisting of letters, digits or
 837     /// underscores) corresponding to a named capture group.
 838     ///
 839     /// If `name` isn't a valid capture group (whether the name doesn't exist
 840     /// or isn't a valid index), then it is replaced with the empty string.
 841     ///
 842     /// The longest possible name is used. e.g., `$1a` looks up the capture
 843     /// group named `1a` and not the capture group at index `1`. To exert more
 844     /// precise control over the name, use braces, e.g., `${1}a`.
 845     ///
 846     /// To write a literal `$` use `$$`.
 847     pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
 848         expand_bytes(self, replacement, dst)
 849     }
 850
 851     /// Returns the number of captured groups.
 852     ///
 853     /// This is always at least `1`, since every regex has at least one capture
 854     /// group that corresponds to the full match.
 855     #[inline]
 856     pub fn len(&self) -> usize {
 857         self.locs.len()
 858     }
 859 }
 860
 861 impl<'t> fmt::Debug for Captures<'t> {
 862     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 863         f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
 864     }
 865 }
 866
 867 struct CapturesDebug<'c, 't: 'c>(&'c Captures<'t>);
 868
 869 impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
 870     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 871         fn escape_bytes(bytes: &[u8]) -> String {
 872             let mut s = String::new();
 873             for &b in bytes {
 874                 s.push_str(&escape_byte(b));
 875             }
 876             s
 877         }
 878
 879         fn escape_byte(byte: u8) -> String {
 880             use std::ascii::escape_default;
 881
 882             let escaped: Vec<u8> = escape_default(byte).collect();
 883             String::from_utf8_lossy(&escaped).into_owned()
 884         }
 885
 886         // We'd like to show something nice here, even if it means an
 887         // allocation to build a reverse index.
 888         let slot_to_name: HashMap<&usize, &String> =
 889             self.0.named_groups.iter().map(|(a, b)| (b, a)).collect();
 890         let mut map = f.debug_map();
 891         for (slot, m) in self.0.locs.iter().enumerate() {
 892             let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e]));
 893             if let Some(name) = slot_to_name.get(&slot) {
 894                 map.entry(&name, &m);
 895             } else {
 896                 map.entry(&slot, &m);
 897             }
 898         }
 899         map.finish()
 900     }
 901 }
 902
 903 /// Get a group by index.
 904 ///
 905 /// `'t` is the lifetime of the matched text.
 906 ///
 907 /// The text can't outlive the `Captures` object if this method is
 908 /// used, because of how `Index` is defined (normally `a[i]` is part
 909 /// of `a` and can't outlive it); to do that, use `get()` instead.
 910 ///
 911 /// # Panics
 912 ///
 913 /// If there is no group at the given index.
 914 impl<'t> Index<usize> for Captures<'t> {
 915     type Output = [u8];
 916
 917     fn index(&self, i: usize) -> &[u8] {
 918         self.get(i).map(|m| m.as_bytes())
 919             .unwrap_or_else(|| panic!("no group at index '{}'", i))
 920     }
 921 }
 922
 923 /// Get a group by name.
 924 ///
 925 /// `'t` is the lifetime of the matched text and `'i` is the lifetime
 926 /// of the group name (the index).
 927 ///
 928 /// The text can't outlive the `Captures` object if this method is
 929 /// used, because of how `Index` is defined (normally `a[i]` is part
 930 /// of `a` and can't outlive it); to do that, use `name` instead.
 931 ///
 932 /// # Panics
 933 ///
 934 /// If there is no group named by the given value.
 935 impl<'t, 'i> Index<&'i str> for Captures<'t> {
 936     type Output = [u8];
 937
 938     fn index<'a>(&'a self, name: &'i str) -> &'a [u8] {
 939         self.name(name).map(|m| m.as_bytes())
 940             .unwrap_or_else(|| panic!("no group named '{}'", name))
 941     }
 942 }
 943
 944 /// An iterator that yields all capturing matches in the order in which they
 945 /// appear in the regex.
 946 ///
 947 /// If a particular capture group didn't participate in the match, then `None`
 948 /// is yielded for that capture. The first match always corresponds to the
 949 /// overall match of the regex.
 950 ///
 951 /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
 952 /// the lifetime `'t` corresponds to the originally matched text.
 953 pub struct SubCaptureMatches<'c, 't: 'c> {
 954     caps: &'c Captures<'t>,
 955     it: SubCapturesPosIter<'c>,
 956 }
 957
 958 impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
 959     type Item = Option<Match<'t>>;
 960
 961     fn next(&mut self) -> Option<Option<Match<'t>>> {
 962         self.it.next()
 963             .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
 964     }
 965 }
 966
 967 /// Replacer describes types that can be used to replace matches in a byte
 968 /// string.
 969 ///
 970 /// In general, users of this crate shouldn't need to implement this trait,
 971 /// since implementations are already provided for `&[u8]` and
 972 /// `FnMut(&Captures) -> Vec<u8>`, which covers most use cases.
 973 pub trait Replacer {
 974     /// Appends text to `dst` to replace the current match.
 975     ///
 976     /// The current match is represented by `caps`, which is guaranteed to
 977     /// have a match at capture group `0`.
 978     ///
 979     /// For example, a no-op replacement would be
 980     /// `dst.extend(&caps[0])`.
 981     fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>);
 982
 983     /// Return a fixed unchanging replacement byte string.
 984     ///
 985     /// When doing replacements, if access to `Captures` is not needed (e.g.,
 986     /// the replacement byte string does not need `$` expansion), then it can
 987     /// be beneficial to avoid finding sub-captures.
 988     ///
 989     /// In general, this is called once for every call to `replacen`.
 990     fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
 991         None
 992     }
 993
 994     /// Return a `Replacer` that borrows and wraps this `Replacer`.
 995     ///
 996     /// This is useful when you want to take a generic `Replacer` (which might
 997     /// not be cloneable) and use it without consuming it, so it can be used
 998     /// more than once.
 999     ///
1000     /// # Example
1001     ///
1002     /// ```
1003     /// use regex::bytes::{Regex, Replacer};
1004     ///
1005     /// fn replace_all_twice<R: Replacer>(
1006     ///     re: Regex,
1007     ///     src: &[u8],
1008     ///     mut rep: R,
1009     /// ) -> Vec<u8> {
1010     ///     let dst = re.replace_all(src, rep.by_ref());
1011     ///     let dst = re.replace_all(&dst, rep.by_ref());
1012     ///     dst.into_owned()
1013     /// }
1014     /// ```
1015     fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
1016         ReplacerRef(self)
1017     }
1018 }
1019
1020 /// By-reference adaptor for a `Replacer`
1021 ///
1022 /// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
1023 #[derive(Debug)]
1024 pub struct ReplacerRef<'a, R: ?Sized + 'a>(&'a mut R);
1025
1026 impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
1027     fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
1028         self.0.replace_append(caps, dst)
1029     }
1030     fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
1031         self.0.no_expansion()
1032     }
1033 }
1034
1035 impl<'a> Replacer for &'a [u8] {
1036     fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
1037         caps.expand(*self, dst);
1038     }
1039
1040     fn no_expansion(&mut self) -> Option<Cow<[u8]>> {
1041         match memchr(b'$', *self) {
1042             Some(_) => None,
1043             None => Some(Cow::Borrowed(*self)),
1044         }
1045     }
1046 }
1047
1048 impl<F> Replacer for F where F: FnMut(&Captures) -> Vec<u8> {
1049     fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
1050         dst.extend_from_slice(&(*self)(caps));
1051     }
1052 }
1053
/// `NoExpand` indicates literal byte string replacement.
///
/// It can be used with `replace` and `replace_all` to do a literal byte string
/// replacement without expanding `$name` to their corresponding capture
/// groups. This can be both convenient (to avoid escaping `$`, for example)
/// and performant (since capture groups don't need to be found).
///
/// The wrapper only holds a shared borrow of the literal, so it implements
/// `Clone` and `Debug`.
///
/// `'t` is the lifetime of the literal text.
#[derive(Clone, Debug)]
pub struct NoExpand<'t>(pub &'t [u8]);
1063
1064 impl<'t> Replacer for NoExpand<'t> {
1065     fn replace_append(&mut self, _: &Captures, dst: &mut Vec<u8>) {
1066         dst.extend_from_slice(self.0);
1067     }
1068
1069     fn no_expansion(&mut self) -> Option<Cow<[u8]>> {
1070         Some(Cow::Borrowed(self.0))
1071     }
1072 }