src/vendor/regex/src/re_unicode.rs

   1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use std::borrow::Cow;
  12 use std::collections::HashMap;
  13 use std::fmt;
  14 use std::ops::Index;
  15 use std::str::FromStr;
  16 use std::sync::Arc;
  17
  18 use memchr::memchr;
  19 use syntax;
  20
  21 use error::Error;
  22 use exec::{Exec, ExecNoSyncStr};
  23 use expand::expand_str;
  24 use re_builder::unicode::RegexBuilder;
  25 use re_plugin::Plugin;
  26 use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter};
  27
  28 /// Escapes all regular expression meta characters in `text`.
  29 ///
  30 /// The string returned may be safely used as a literal in a regular
  31 /// expression.
  32 pub fn escape(text: &str) -> String {
  33     syntax::escape(text)
  34 }
  35
  36 /// Match represents a single match of a regex in a haystack.
  37 ///
  38 /// The lifetime parameter `'t` refers to the lifetime of the matched text.
  39 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
  40 pub struct Match<'t> {
  41     text: &'t str,
  42     start: usize,
  43     end: usize,
  44 }
  45
  46 impl<'t> Match<'t> {
  47     /// Returns the starting byte offset of the match in the haystack.
  48     #[inline]
  49     pub fn start(&self) -> usize {
  50         self.start
  51     }
  52
  53     /// Returns the ending byte offset of the match in the haystack.
  54     #[inline]
  55     pub fn end(&self) -> usize {
  56         self.end
  57     }
  58
  59     /// Returns the matched text.
  60     #[inline]
  61     pub fn as_str(&self) -> &'t str {
  62         &self.text[self.start..self.end]
  63     }
  64
  65     /// Creates a new match from the given haystack and byte offsets.
  66     #[inline]
  67     fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
  68         Match {
  69             text: haystack,
  70             start: start,
  71             end: end,
  72         }
  73     }
  74 }
  75
  76 /// A compiled regular expression for matching Unicode strings.
  77 ///
  78 /// It is represented as either a sequence of bytecode instructions (dynamic)
  79 /// or as a specialized Rust function (native). It can be used to search, split
  80 /// or replace text. All searching is done with an implicit `.*?` at the
  81 /// beginning and end of an expression. To force an expression to match the
  82 /// whole string (or a prefix or a suffix), you must use an anchor like `^` or
  83 /// `$` (or `\A` and `\z`).
  84 ///
  85 /// While this crate will handle Unicode strings (whether in the regular
  86 /// expression or in the search text), all positions returned are **byte
  87 /// indices**. Every byte index is guaranteed to be at a Unicode code point
  88 /// boundary.
  89 ///
  90 /// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a
  91 /// compiled regular expression and text to search, respectively.
  92 ///
  93 /// The only methods that allocate new strings are the string replacement
  94 /// methods. All other methods (searching and splitting) return borrowed
  95 /// pointers into the string given.
  96 ///
  97 /// # Examples
  98 ///
  99 /// Find the location of a US phone number:
 100 ///
 101 /// ```rust
 102 /// # use regex::Regex;
 103 /// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
 104 /// let mat = re.find("phone: 111-222-3333").unwrap();
 105 /// assert_eq!((mat.start(), mat.end()), (7, 19));
 106 /// ```
 107 ///
 108 /// # Using the `std::str::pattern` methods with `Regex`
 109 ///
 110 /// > **Note**: This section requires that this crate is compiled with the
 111 /// > `pattern` Cargo feature enabled, which **requires nightly Rust**.
 112 ///
 113 /// Since `Regex` implements `Pattern`, you can use regexes with methods
 114 /// defined on `&str`. For example, `is_match`, `find`, `find_iter`
 115 /// and `split` can be replaced with `str::contains`, `str::find`,
 116 /// `str::match_indices` and `str::split`.
 117 ///
 118 /// Here are some examples:
 119 ///
 120 /// ```rust,ignore
 121 /// # use regex::Regex;
 122 /// let re = Regex::new(r"\d+").unwrap();
 123 /// let haystack = "a111b222c";
 124 ///
 125 /// assert!(haystack.contains(&re));
 126 /// assert_eq!(haystack.find(&re), Some(1));
 127 /// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(),
 128 ///            vec![(1, 4), (5, 8)]);
 129 /// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]);
 130 /// ```
// The wrapped `_Regex` field is `pub` but hidden from the generated docs:
// the representation is exported only to support the `regex!` syntax
// extension and must not be relied upon by other code.
#[derive(Clone)]
pub struct Regex(#[doc(hidden)] pub _Regex);
 133
#[derive(Clone)]
#[doc(hidden)]
pub enum _Regex {
    // The representation of `Regex` is exported to support the `regex!`
    // syntax extension. Do not rely on it.
    //
    // See the comments for the `internal` module in `lib.rs` for a more
    // detailed explanation for what `regex!` requires.

    // A regex backed by an `Exec`. This is the variant produced by
    // `From<Exec> for Regex`.
    #[doc(hidden)]
    Dynamic(Exec),
    // A regex backed by a `Plugin`.
    // NOTE(review): presumably only constructed by the `regex!` syntax
    // extension -- confirm against `re_plugin`.
    #[doc(hidden)]
    Plugin(Plugin),
}
 147
 148 impl fmt::Display for Regex {
 149     /// Shows the original regular expression.
 150     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 151         write!(f, "{}", self.as_str())
 152     }
 153 }
 154
 155 impl fmt::Debug for Regex {
 156     /// Shows the original regular expression.
 157     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 158         fmt::Display::fmt(self, f)
 159     }
 160 }
 161
 162 #[doc(hidden)]
 163 impl From<Exec> for Regex {
 164     fn from(exec: Exec) -> Regex {
 165         Regex(_Regex::Dynamic(exec))
 166     }
 167 }
 168
 169 impl FromStr for Regex {
 170     type Err = Error;
 171
 172     /// Attempts to parse a string into a regular expression
 173     fn from_str(s: &str) -> Result<Regex, Error> {
 174         Regex::new(s)
 175     }
 176 }
 177
 178 /// Core regular expression methods.
 179 impl Regex {
 180     /// Compiles a regular expression. Once compiled, it can be used repeatedly
 181     /// to search, split or replace text in a string.
 182     ///
 183     /// If an invalid expression is given, then an error is returned.
 184     pub fn new(re: &str) -> Result<Regex, Error> {
 185         RegexBuilder::new(re).build()
 186     }
 187
 188     /// Returns true if and only if the regex matches the string given.
 189     ///
 190     /// It is recommended to use this method if all you need to do is test
 191     /// a match, since the underlying matching engine may be able to do less
 192     /// work.
 193     ///
 194     /// # Example
 195     ///
 196     /// Test if some text contains at least one word with exactly 13
 197     /// Unicode word characters:
 198     ///
 199     /// ```rust
 200     /// # extern crate regex; use regex::Regex;
 201     /// # fn main() {
 202     /// let text = "I categorically deny having triskaidekaphobia.";
 203     /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
 204     /// # }
 205     /// ```
 206     pub fn is_match(&self, text: &str) -> bool {
 207         self.is_match_at(text, 0)
 208     }
 209
 210     /// Returns the start and end byte range of the leftmost-first match in
 211     /// `text`. If no match exists, then `None` is returned.
 212     ///
 213     /// Note that this should only be used if you want to discover the position
 214     /// of the match. Testing the existence of a match is faster if you use
 215     /// `is_match`.
 216     ///
 217     /// # Example
 218     ///
 219     /// Find the start and end location of the first word with exactly 13
 220     /// Unicode word characters:
 221     ///
 222     /// ```rust
 223     /// # extern crate regex; use regex::Regex;
 224     /// # fn main() {
 225     /// let text = "I categorically deny having triskaidekaphobia.";
 226     /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
 227     /// assert_eq!(mat.start(), 2);
 228     /// assert_eq!(mat.end(), 15);
 229     /// # }
 230     /// ```
 231     pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
 232         self.find_at(text, 0)
 233     }
 234
 235     /// Returns an iterator for each successive non-overlapping match in
 236     /// `text`, returning the start and end byte indices with respect to
 237     /// `text`.
 238     ///
 239     /// # Example
 240     ///
 241     /// Find the start and end location of every word with exactly 13 Unicode
 242     /// word characters:
 243     ///
 244     /// ```rust
 245     /// # extern crate regex; use regex::Regex;
 246     /// # fn main() {
 247     /// let text = "Retroactively relinquishing remunerations is reprehensible.";
 248     /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
 249     ///     println!("{:?}", mat);
 250     /// }
 251     /// # }
 252     /// ```
 253     pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
 254         match self.0 {
 255             _Regex::Dynamic(ref exec) => {
 256                 let it = exec.searcher_str().find_iter(text);
 257                 Matches(MatchesInner::Dynamic(it))
 258             }
 259             _Regex::Plugin(ref plug) => {
 260                 let it = plug.find_iter(text);
 261                 Matches(MatchesInner::Plugin(it))
 262             }
 263         }
 264     }
 265
 266     /// Returns the capture groups corresponding to the leftmost-first
 267     /// match in `text`. Capture group `0` always corresponds to the entire
 268     /// match. If no match is found, then `None` is returned.
 269     ///
 270     /// You should only use `captures` if you need access to the location of
 271     /// capturing group matches. Otherwise, `find` is faster for discovering
 272     /// the location of the overall match.
 273     ///
 274     /// # Examples
 275     ///
 276     /// Say you have some text with movie names and their release years,
 277     /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
 278     /// looking like that, while also extracting the movie name and its release
 279     /// year separately.
 280     ///
 281     /// ```rust
 282     /// # extern crate regex; use regex::Regex;
 283     /// # fn main() {
 284     /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
 285     /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
 286     /// let caps = re.captures(text).unwrap();
 287     /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane");
 288     /// assert_eq!(caps.get(2).unwrap().as_str(), "1941");
 289     /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
 290     /// // You can also access the groups by index using the Index notation.
 291     /// // Note that this will panic on an invalid index.
 292     /// assert_eq!(&caps[1], "Citizen Kane");
 293     /// assert_eq!(&caps[2], "1941");
 294     /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
 295     /// # }
 296     /// ```
 297     ///
 298     /// Note that the full match is at capture group `0`. Each subsequent
 299     /// capture group is indexed by the order of its opening `(`.
 300     ///
 301     /// We can make this example a bit clearer by using *named* capture groups:
 302     ///
 303     /// ```rust
 304     /// # extern crate regex; use regex::Regex;
 305     /// # fn main() {
 306     /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
 307     ///                .unwrap();
 308     /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
 309     /// let caps = re.captures(text).unwrap();
 310     /// assert_eq!(&caps["title"], "Citizen Kane");
 311     /// assert_eq!(&caps["year"], "1941");
 312     /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
 313     /// // You can also access the groups by name using the Index notation.
 314     /// // Note that this will panic on an invalid group name.
 315     /// assert_eq!(&caps["title"], "Citizen Kane");
 316     /// assert_eq!(&caps["year"], "1941");
 317     /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
 318     ///
 319     /// # }
 320     /// ```
 321     ///
 322     /// Here we name the capture groups, which we can access with the `name`
 323     /// method or the `Index` notation with a `&str`. Note that the named
 324     /// capture groups are still accessible with `get` or the `Index` notation
 325     /// with a `usize`.
 326     ///
    /// The `0`th capture group is always unnamed, so it must always be
    /// accessed with `get(0)` or `[0]`.
 329     pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
 330         let mut locs = self.locations();
 331         self.read_captures_at(&mut locs, text, 0).map(|_| Captures {
 332             text: text,
 333             locs: locs,
 334             named_groups: NamedGroups::from_regex(self)
 335         })
 336     }
 337
 338     /// Returns an iterator over all the non-overlapping capture groups matched
 339     /// in `text`. This is operationally the same as `find_iter`, except it
 340     /// yields information about capturing group matches.
 341     ///
 342     /// # Example
 343     ///
 344     /// We can use this to find all movie titles and their release years in
 345     /// some text, where the movie is formatted like "'Title' (xxxx)":
 346     ///
 347     /// ```rust
 348     /// # extern crate regex; use regex::Regex;
 349     /// # fn main() {
 350     /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
 351     ///                .unwrap();
 352     /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
 353     /// for caps in re.captures_iter(text) {
 354     ///     println!("Movie: {:?}, Released: {:?}",
 355     ///              &caps["title"], &caps["year"]);
 356     /// }
 357     /// // Output:
 358     /// // Movie: Citizen Kane, Released: 1941
 359     /// // Movie: The Wizard of Oz, Released: 1939
 360     /// // Movie: M, Released: 1931
 361     /// # }
 362     /// ```
 363     pub fn captures_iter<'r, 't>(
 364         &'r self,
 365         text: &'t str,
 366     ) -> CaptureMatches<'r, 't> {
 367         match self.0 {
 368             _Regex::Dynamic(ref exec) => {
 369                 let it = exec.searcher_str().captures_iter(text);
 370                 CaptureMatches(CaptureMatchesInner::Dynamic(it))
 371             }
 372             _Regex::Plugin(ref plug) => {
 373                 let it = plug.captures_iter(text);
 374                 CaptureMatches(CaptureMatchesInner::Plugin(it))
 375             }
 376         }
 377     }
 378
 379     /// Returns an iterator of substrings of `text` delimited by a match of the
 380     /// regular expression. Namely, each element of the iterator corresponds to
 381     /// text that *isn't* matched by the regular expression.
 382     ///
 383     /// This method will *not* copy the text given.
 384     ///
 385     /// # Example
 386     ///
 387     /// To split a string delimited by arbitrary amounts of spaces or tabs:
 388     ///
 389     /// ```rust
 390     /// # extern crate regex; use regex::Regex;
 391     /// # fn main() {
 392     /// let re = Regex::new(r"[ \t]+").unwrap();
 393     /// let fields: Vec<&str> = re.split("a b \t  c\td    e").collect();
 394     /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
 395     /// # }
 396     /// ```
 397     pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> {
 398         Split {
 399             finder: self.find_iter(text),
 400             last: 0,
 401         }
 402     }
 403
 404     /// Returns an iterator of at most `limit` substrings of `text` delimited
 405     /// by a match of the regular expression. (A `limit` of `0` will return no
 406     /// substrings.) Namely, each element of the iterator corresponds to text
 407     /// that *isn't* matched by the regular expression. The remainder of the
 408     /// string that is not split will be the last element in the iterator.
 409     ///
 410     /// This method will *not* copy the text given.
 411     ///
 412     /// # Example
 413     ///
 414     /// Get the first two words in some text:
 415     ///
 416     /// ```rust
 417     /// # extern crate regex; use regex::Regex;
 418     /// # fn main() {
 419     /// let re = Regex::new(r"\W+").unwrap();
 420     /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
 421     /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
 422     /// # }
 423     /// ```
 424     pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize)
 425                          -> SplitN<'r, 't> {
 426         SplitN {
 427             splits: self.split(text),
 428             n: limit,
 429         }
 430     }
 431
 432     /// Replaces the leftmost-first match with the replacement provided.
 433     /// The replacement can be a regular string (where `$N` and `$name` are
 434     /// expanded to match capture groups) or a function that takes the matches'
 435     /// `Captures` and returns the replaced string.
 436     ///
 437     /// If no match is found, then a copy of the string is returned unchanged.
 438     ///
 439     /// # Replacement string syntax
 440     ///
    /// All instances of `$name` in the replacement text are replaced with
    /// the corresponding capture group `name`.
 443     ///
 444     /// `name` may be an integer corresponding to the index of the
 445     /// capture group (counted by order of opening parenthesis where `0` is the
 446     /// entire match) or it can be a name (consisting of letters, digits or
 447     /// underscores) corresponding to a named capture group.
 448     ///
 449     /// If `name` isn't a valid capture group (whether the name doesn't exist
 450     /// or isn't a valid index), then it is replaced with the empty string.
 451     ///
 452     /// The longest possible name is used. e.g., `$1a` looks up the capture
 453     /// group named `1a` and not the capture group at index `1`. To exert more
 454     /// precise control over the name, use braces, e.g., `${1}a`.
 455     ///
 456     /// To write a literal `$` use `$$`.
 457     ///
 458     /// # Examples
 459     ///
 460     /// Note that this function is polymorphic with respect to the replacement.
 461     /// In typical usage, this can just be a normal string:
 462     ///
 463     /// ```rust
 464     /// # extern crate regex; use regex::Regex;
 465     /// # fn main() {
 466     /// let re = Regex::new("[^01]+").unwrap();
 467     /// assert_eq!(re.replace("1078910", ""), "1010");
 468     /// # }
 469     /// ```
 470     ///
 471     /// But anything satisfying the `Replacer` trait will work. For example,
 472     /// a closure of type `|&Captures| -> String` provides direct access to the
 473     /// captures corresponding to a match. This allows one to access
 474     /// capturing group matches easily:
 475     ///
 476     /// ```rust
 477     /// # extern crate regex; use regex::Regex;
 478     /// # use regex::Captures; fn main() {
 479     /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
 480     /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
 481     ///     format!("{} {}", &caps[2], &caps[1])
 482     /// });
 483     /// assert_eq!(result, "Bruce Springsteen");
 484     /// # }
 485     /// ```
 486     ///
 487     /// But this is a bit cumbersome to use all the time. Instead, a simple
 488     /// syntax is supported that expands `$name` into the corresponding capture
 489     /// group. Here's the last example, but using this expansion technique
 490     /// with named capture groups:
 491     ///
 492     /// ```rust
 493     /// # extern crate regex; use regex::Regex;
 494     /// # fn main() {
 495     /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
 496     /// let result = re.replace("Springsteen, Bruce", "$first $last");
 497     /// assert_eq!(result, "Bruce Springsteen");
 498     /// # }
 499     /// ```
 500     ///
 501     /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
 502     /// would produce the same result. To write a literal `$` use `$$`.
 503     ///
 504     /// Sometimes the replacement string requires use of curly braces to
 505     /// delineate a capture group replacement and surrounding literal text.
 506     /// For example, if we wanted to join two words together with an
 507     /// underscore:
 508     ///
 509     /// ```rust
 510     /// # extern crate regex; use regex::Regex;
 511     /// # fn main() {
 512     /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
 513     /// let result = re.replace("deep fried", "${first}_$second");
 514     /// assert_eq!(result, "deep_fried");
 515     /// # }
 516     /// ```
 517     ///
 518     /// Without the curly braces, the capture group name `first_` would be
 519     /// used, and since it doesn't exist, it would be replaced with the empty
 520     /// string.
 521     ///
    /// Finally, sometimes you just want to replace a literal string with no
    /// regard for capturing group expansion. This can be done by wrapping a
    /// string with `NoExpand`:
 525     ///
 526     /// ```rust
 527     /// # extern crate regex; use regex::Regex;
 528     /// # fn main() {
 529     /// use regex::NoExpand;
 530     ///
 531     /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
 532     /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
 533     /// assert_eq!(result, "$2 $last");
 534     /// # }
 535     /// ```
 536     pub fn replace<'t, R: Replacer>(
 537         &self,
 538         text: &'t str,
 539         rep: R,
 540     ) -> Cow<'t, str> {
 541         self.replacen(text, 1, rep)
 542     }
 543
 544     /// Replaces all non-overlapping matches in `text` with the replacement
 545     /// provided. This is the same as calling `replacen` with `limit` set to
 546     /// `0`.
 547     ///
 548     /// See the documentation for `replace` for details on how to access
 549     /// capturing group matches in the replacement string.
 550     pub fn replace_all<'t, R: Replacer>(
 551         &self,
 552         text: &'t str,
 553         rep: R,
 554     ) -> Cow<'t, str> {
 555         self.replacen(text, 0, rep)
 556     }
 557
 558     /// Replaces at most `limit` non-overlapping matches in `text` with the
 559     /// replacement provided. If `limit` is 0, then all non-overlapping matches
 560     /// are replaced.
 561     ///
 562     /// See the documentation for `replace` for details on how to access
 563     /// capturing group matches in the replacement string.
 564     pub fn replacen<'t, R: Replacer>(
 565         &self,
 566         text: &'t str,
 567         limit: usize,
 568         mut rep: R,
 569     ) -> Cow<'t, str> {
 570         // If we know that the replacement doesn't have any capture expansions,
 571         // then we can fast path. The fast path can make a tremendous
 572         // difference:
 573         //
 574         //   1) We use `find_iter` instead of `captures_iter`. Not asking for
 575         //      captures generally makes the regex engines faster.
 576         //   2) We don't need to look up all of the capture groups and do
 577         //      replacements inside the replacement string. We just push it
 578         //      at each match and be done with it.
 579         if let Some(rep) = rep.no_expansion() {
 580             let mut new = String::with_capacity(text.len());
 581             let mut last_match = 0;
 582             for (i, m) in self.find_iter(text).enumerate() {
 583                 if limit > 0 && i >= limit {
 584                     break
 585                 }
 586                 new.push_str(&text[last_match..m.start()]);
 587                 new.push_str(&rep);
 588                 last_match = m.end();
 589             }
 590             if last_match == 0 {
 591                 return Cow::Borrowed(text);
 592             }
 593             new.push_str(&text[last_match..]);
 594             return Cow::Owned(new);
 595         }
 596
 597         // The slower path, which we use if the replacement needs access to
 598         // capture groups.
 599         let mut it = self.captures_iter(text).enumerate().peekable();
 600         if it.peek().is_none() {
 601             return Cow::Borrowed(text);
 602         }
 603         let mut new = String::with_capacity(text.len());
 604         let mut last_match = 0;
 605         for (i, cap) in it {
 606             if limit > 0 && i >= limit {
 607                 break
 608             }
 609             // unwrap on 0 is OK because captures only reports matches
 610             let m = cap.get(0).unwrap();
 611             new.push_str(&text[last_match..m.start()]);
 612             rep.replace_append(&cap, &mut new);
 613             last_match = m.end();
 614         }
 615         new.push_str(&text[last_match..]);
 616         Cow::Owned(new)
 617     }
 618 }
 619
 620 /// Advanced or "lower level" search methods.
 621 impl Regex {
 622     /// Returns the end location of a match in the text given.
 623     ///
 624     /// This method may have the same performance characteristics as
 625     /// `is_match`, except it provides an end location for a match. In
 626     /// particular, the location returned *may be shorter* than the proper end
 627     /// of the leftmost-first match.
 628     ///
 629     /// # Example
 630     ///
 631     /// Typically, `a+` would match the entire first sequence of `a` in some
 632     /// text, but `shortest_match` can give up as soon as it sees the first
 633     /// `a`.
 634     ///
 635     /// ```rust
 636     /// # extern crate regex; use regex::Regex;
 637     /// # fn main() {
 638     /// let text = "aaaaa";
 639     /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
 640     /// assert_eq!(pos, Some(1));
 641     /// # }
 642     /// ```
 643     pub fn shortest_match(&self, text: &str) -> Option<usize> {
 644         self.shortest_match_at(text, 0)
 645     }
 646
 647     /// Returns the same as shortest_match, but starts the search at the given
 648     /// offset.
 649     ///
 650     /// The significance of the starting point is that it takes the surrounding
 651     /// context into consideration. For example, the `\A` anchor can only
 652     /// match when `start == 0`.
 653     #[doc(hidden)]
 654     pub fn shortest_match_at(
 655         &self,
 656         text: &str,
 657         start: usize,
 658     ) -> Option<usize> {
 659         match self.0 {
 660             _Regex::Dynamic(ref exec) => {
 661                 exec.searcher_str().shortest_match_at(text, start)
 662             }
 663             _Regex::Plugin(ref plug) => plug.shortest_match_at(text, start),
 664         }
 665     }
 666
 667     /// Returns the same as is_match, but starts the search at the given
 668     /// offset.
 669     ///
 670     /// The significance of the starting point is that it takes the surrounding
 671     /// context into consideration. For example, the `\A` anchor can only
 672     /// match when `start == 0`.
 673     #[doc(hidden)]
 674     pub fn is_match_at(&self, text: &str, start: usize) -> bool {
 675         self.shortest_match_at(text, start).is_some()
 676     }
 677
 678     /// Returns the same as find, but starts the search at the given
 679     /// offset.
 680     ///
 681     /// The significance of the starting point is that it takes the surrounding
 682     /// context into consideration. For example, the `\A` anchor can only
 683     /// match when `start == 0`.
 684     #[doc(hidden)]
 685     pub fn find_at<'t>(
 686         &self,
 687         text: &'t str,
 688         start: usize,
 689     ) -> Option<Match<'t>> {
 690         match self.0 {
 691             _Regex::Dynamic(ref exec) => {
 692                 exec.searcher_str().find_at(text, start).map(|(s, e)| {
 693                     Match::new(text, s, e)
 694                 })
 695             }
 696             _Regex::Plugin(ref plug) => {
 697                 plug.find_at(text, start).map(|(s, e)| Match::new(text, s, e))
 698             }
 699         }
 700     }
 701
 702     /// Returns the same as captures, but starts the search at the given
 703     /// offset and populates the capture locations given.
 704     ///
 705     /// The significance of the starting point is that it takes the surrounding
 706     /// context into consideration. For example, the `\A` anchor can only
 707     /// match when `start == 0`.
 708     #[doc(hidden)]
 709     pub fn read_captures_at<'t>(
 710         &self,
 711         locs: &mut Locations,
 712         text: &'t str,
 713         start: usize,
 714     ) -> Option<Match<'t>> {
 715         match self.0 {
 716             _Regex::Dynamic(ref exec) => {
 717                 exec.searcher_str().read_captures_at(locs, text, start)
 718                     .map(|(s, e)| Match::new(text, s, e))
 719             }
 720             _Regex::Plugin(ref plug) => {
 721                 plug.read_captures_at(locs, text, start)
 722                     .map(|(s, e)| Match::new(text, s, e))
 723             }
 724         }
 725     }
 726 }
 727
 728 /// Auxiliary methods.
 729 impl Regex {
 730     /// Returns the original string of this regex.
 731     pub fn as_str(&self) -> &str {
 732         match self.0 {
 733             _Regex::Dynamic(ref exec) => &exec.regex_strings()[0],
 734             _Regex::Plugin(ref plug) => &plug.original,
 735         }
 736     }
 737
 738     /// Returns an iterator over the capture names.
 739     pub fn capture_names(&self) -> CaptureNames {
 740         CaptureNames(match self.0 {
 741             _Regex::Plugin(ref n) => _CaptureNames::Plugin(n.names.iter()),
 742             _Regex::Dynamic(ref d) => {
 743                 _CaptureNames::Dynamic(d.capture_names().iter())
 744             }
 745         })
 746     }
 747
 748     /// Returns the number of captures.
 749     pub fn captures_len(&self) -> usize {
 750         match self.0 {
 751             _Regex::Plugin(ref n) => n.names.len(),
 752             _Regex::Dynamic(ref d) => d.capture_names().len()
 753         }
 754     }
 755
    /// Returns an empty set of locations that can be reused in multiple calls
    /// to `read_captures_at`.
 758     #[doc(hidden)]
 759     pub fn locations(&self) -> Locations {
 760         match self.0 {
 761             _Regex::Dynamic(ref exec) => {
 762                 exec.searcher_str().locations()
 763             }
 764             _Regex::Plugin(ref plug) => plug.locations(),
 765         }
 766     }
 767 }
 768
 769 /// An iterator over the names of all possible captures.
 770 ///
 771 /// `None` indicates an unnamed capture; the first element (capture 0, the
 772 /// whole matched region) is always unnamed.
 773 ///
 774 /// `'r` is the lifetime of the compiled regular expression.
 775 pub struct CaptureNames<'r>(_CaptureNames<'r>);
 776
 777 enum _CaptureNames<'r> {
 778     Plugin(::std::slice::Iter<'r, Option<&'static str>>),
 779     Dynamic(::std::slice::Iter<'r, Option<String>>)
 780 }
 781
 782 impl<'r> Iterator for CaptureNames<'r> {
 783     type Item = Option<&'r str>;
 784
 785     fn next(&mut self) -> Option<Option<&'r str>> {
 786         match self.0 {
 787             _CaptureNames::Plugin(ref mut i) => i.next().cloned(),
 788             _CaptureNames::Dynamic(ref mut i) => {
 789                 i.next().as_ref().map(|o| o.as_ref().map(|s| s.as_ref()))
 790             }
 791         }
 792     }
 793
 794     fn size_hint(&self) -> (usize, Option<usize>) {
 795         match self.0 {
 796             _CaptureNames::Plugin(ref i)  => i.size_hint(),
 797             _CaptureNames::Dynamic(ref i) => i.size_hint(),
 798         }
 799     }
 800 }
 801
 802 /// Yields all substrings delimited by a regular expression match.
 803 ///
 804 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 805 /// lifetime of the string being split.
 806 pub struct Split<'r, 't> {
 807     finder: Matches<'r, 't>,
 808     last: usize,
 809 }
 810
 811 impl<'r, 't> Iterator for Split<'r, 't> {
 812     type Item = &'t str;
 813
 814     fn next(&mut self) -> Option<&'t str> {
 815         let text = self.finder.text();
 816         match self.finder.next() {
 817             None => {
 818                 if self.last >= text.len() {
 819                     None
 820                 } else {
 821                     let s = &text[self.last..];
 822                     self.last = text.len();
 823                     Some(s)
 824                 }
 825             }
 826             Some(m) => {
 827                 let matched = &text[self.last..m.start()];
 828                 self.last = m.end();
 829                 Some(matched)
 830             }
 831         }
 832     }
 833 }
 834
 835 /// Yields at most `N` substrings delimited by a regular expression match.
 836 ///
 837 /// The last substring will be whatever remains after splitting.
 838 ///
 839 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 840 /// lifetime of the string being split.
 841 pub struct SplitN<'r, 't> {
 842     splits: Split<'r, 't>,
 843     n: usize,
 844 }
 845
 846 impl<'r, 't> Iterator for SplitN<'r, 't> {
 847     type Item = &'t str;
 848
 849     fn next(&mut self) -> Option<&'t str> {
 850         if self.n == 0 {
 851             return None
 852         }
 853         self.n -= 1;
 854         if self.n == 0 {
 855             let text = self.splits.finder.text();
 856             Some(&text[self.splits.last..])
 857         } else {
 858             self.splits.next()
 859         }
 860     }
 861 }
 862
 863 enum NamedGroups {
 864     Plugin(&'static [(&'static str, usize)]),
 865     Dynamic(Arc<HashMap<String, usize>>),
 866 }
 867
 868 impl NamedGroups {
 869     fn from_regex(regex: &Regex) -> NamedGroups {
 870         match regex.0 {
 871             _Regex::Plugin(ref plug) => NamedGroups::Plugin(&plug.groups),
 872             _Regex::Dynamic(ref exec) => {
 873                 NamedGroups::Dynamic(exec.capture_name_idx().clone())
 874             }
 875         }
 876     }
 877
 878     fn pos(&self, name: &str) -> Option<usize> {
 879         match *self {
 880             NamedGroups::Plugin(groups) => {
 881                 groups.binary_search_by(|&(n, _)| n.cmp(name))
 882                       .ok().map(|i| groups[i].1)
 883             },
 884             NamedGroups::Dynamic(ref groups) => {
 885                 groups.get(name).map(|i| *i)
 886             },
 887         }
 888     }
 889
 890     fn iter<'n>(&'n self) -> NamedGroupsIter<'n> {
 891         match *self {
 892             NamedGroups::Plugin(g) => NamedGroupsIter::Plugin(g.iter()),
 893             NamedGroups::Dynamic(ref g) => NamedGroupsIter::Dynamic(g.iter()),
 894         }
 895     }
 896 }
 897
 898 enum NamedGroupsIter<'n> {
 899     Plugin(::std::slice::Iter<'static, (&'static str, usize)>),
 900     Dynamic(::std::collections::hash_map::Iter<'n, String, usize>),
 901 }
 902
 903 impl<'n> Iterator for NamedGroupsIter<'n> {
 904     type Item = (&'n str, usize);
 905
 906     fn next(&mut self) -> Option<Self::Item> {
 907         match *self {
 908             NamedGroupsIter::Plugin(ref mut it) => it.next().map(|&v| v),
 909             NamedGroupsIter::Dynamic(ref mut it) => {
 910                 it.next().map(|(s, i)| (s.as_ref(), *i))
 911             }
 912         }
 913     }
 914 }
 915
 916 /// Captures represents a group of captured strings for a single match.
 917 ///
 918 /// The 0th capture always corresponds to the entire match. Each subsequent
 919 /// index corresponds to the next capture group in the regex. If a capture
 920 /// group is named, then the matched string is *also* available via the `name`
 921 /// method. (Note that the 0th capture is always unnamed and so must be
 922 /// accessed with the `get` method.)
 923 ///
 924 /// Positions returned from a capture group are always byte indices.
 925 ///
 926 /// `'t` is the lifetime of the matched text.
 927 pub struct Captures<'t> {
 928     text: &'t str,
 929     locs: Locations,
 930     named_groups: NamedGroups,
 931 }
 932
 933 impl<'t> Captures<'t> {
 934     /// Returns the match associated with the capture group at index `i`. If
 935     /// `i` does not correspond to a capture group, or if the capture group
 936     /// did not participate in the match, then `None` is returned.
 937     ///
 938     /// # Examples
 939     ///
 940     /// Get the text of the match with a default of an empty string if this
 941     /// group didn't participate in the match:
 942     ///
 943     /// ```rust
 944     /// # use regex::Regex;
 945     /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
 946     /// let caps = re.captures("abc123").unwrap();
 947     ///
 948     /// let text1 = caps.get(1).map_or("", |m| m.as_str());
 949     /// let text2 = caps.get(2).map_or("", |m| m.as_str());
 950     /// assert_eq!(text1, "123");
 951     /// assert_eq!(text2, "");
 952     /// ```
 953     pub fn get(&self, i: usize) -> Option<Match<'t>> {
 954         self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e))
 955     }
 956
 957     /// Returns the match for the capture group named `name`. If `name` isn't a
 958     /// valid capture group or didn't match anything, then `None` is returned.
 959     pub fn name(&self, name: &str) -> Option<Match<'t>> {
 960         self.named_groups.pos(name).and_then(|i| self.get(i))
 961     }
 962
 963     /// An iterator that yields all capturing matches in the order in which
 964     /// they appear in the regex. If a particular capture group didn't
 965     /// participate in the match, then `None` is yielded for that capture.
 966     ///
 967     /// The first match always corresponds to the overall match of the regex.
 968     pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
 969         SubCaptureMatches {
 970             caps: self,
 971             it: self.locs.iter(),
 972         }
 973     }
 974
 975     /// Expands all instances of `$name` in `text` to the corresponding capture
 976     /// group `name`, and writes them to the `dst` buffer given.
 977     ///
 978     /// `name` may be an integer corresponding to the index of the
 979     /// capture group (counted by order of opening parenthesis where `0` is the
 980     /// entire match) or it can be a name (consisting of letters, digits or
 981     /// underscores) corresponding to a named capture group.
 982     ///
 983     /// If `name` isn't a valid capture group (whether the name doesn't exist
 984     /// or isn't a valid index), then it is replaced with the empty string.
 985     ///
 986     /// The longest possible name is used. e.g., `$1a` looks up the capture
 987     /// group named `1a` and not the capture group at index `1`. To exert more
 988     /// precise control over the name, use braces, e.g., `${1}a`.
 989     ///
 990     /// To write a literal `$` use `$$`.
 991     pub fn expand(&self, replacement: &str, dst: &mut String) {
 992         expand_str(self, replacement, dst)
 993     }
 994
 995     /// Returns the number of captured groups.
 996     ///
 997     /// This is always at least `1`, since every regex has at least one capture
 998     /// group that corresponds to the full match.
 999     #[inline]
1000     pub fn len(&self) -> usize {
1001         self.locs.len()
1002     }
1003 }
1004
1005 impl<'t> fmt::Debug for Captures<'t> {
1006     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1007         f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
1008     }
1009 }
1010
1011 struct CapturesDebug<'c, 't: 'c>(&'c Captures<'t>);
1012
1013 impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
1014     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1015         // We'd like to show something nice here, even if it means an
1016         // allocation to build a reverse index.
1017         let slot_to_name: HashMap<usize, &str> =
1018             self.0.named_groups.iter().map(|(a, b)| (b, a)).collect();
1019         let mut map = f.debug_map();
1020         for (slot, m) in self.0.locs.iter().enumerate() {
1021             let m = m.map(|(s, e)| &self.0.text[s..e]);
1022             if let Some(ref name) = slot_to_name.get(&slot) {
1023                 map.entry(&name, &m);
1024             } else {
1025                 map.entry(&slot, &m);
1026             }
1027         }
1028         map.finish()
1029     }
1030 }
1031
1032 /// Get a group by index.
1033 ///
1034 /// `'t` is the lifetime of the matched text.
1035 ///
1036 /// The text can't outlive the `Captures` object if this method is
1037 /// used, because of how `Index` is defined (normally `a[i]` is part
1038 /// of `a` and can't outlive it); to do that, use `get()` instead.
1039 ///
1040 /// # Panics
1041 ///
1042 /// If there is no group at the given index.
1043 impl<'t> Index<usize> for Captures<'t> {
1044     type Output = str;
1045
1046     fn index(&self, i: usize) -> &str {
1047         self.get(i).map(|m| m.as_str())
1048             .unwrap_or_else(|| panic!("no group at index '{}'", i))
1049     }
1050 }
1051
1052 /// Get a group by name.
1053 ///
1054 /// `'t` is the lifetime of the matched text and `'i` is the lifetime
1055 /// of the group name (the index).
1056 ///
1057 /// The text can't outlive the `Captures` object if this method is
1058 /// used, because of how `Index` is defined (normally `a[i]` is part
1059 /// of `a` and can't outlive it); to do that, use `name` instead.
1060 ///
1061 /// # Panics
1062 ///
1063 /// If there is no group named by the given value.
1064 impl<'t, 'i> Index<&'i str> for Captures<'t> {
1065     type Output = str;
1066
1067     fn index<'a>(&'a self, name: &'i str) -> &'a str {
1068         self.name(name).map(|m| m.as_str())
1069             .unwrap_or_else(|| panic!("no group named '{}'", name))
1070     }
1071 }
1072
1073 /// An iterator that yields all capturing matches in the order in which they
1074 /// appear in the regex.
1075 ///
1076 /// If a particular capture group didn't participate in the match, then `None`
1077 /// is yielded for that capture. The first match always corresponds to the
1078 /// overall match of the regex.
1079 ///
1080 /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
1081 /// the lifetime `'t` corresponds to the originally matched text.
1082 pub struct SubCaptureMatches<'c, 't: 'c> {
1083     caps: &'c Captures<'t>,
1084     it: SubCapturesPosIter<'c>,
1085 }
1086
1087 impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
1088     type Item = Option<Match<'t>>;
1089
1090     fn next(&mut self) -> Option<Option<Match<'t>>> {
1091         self.it.next()
1092             .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
1093     }
1094 }
1095
1096 /// An iterator that yields all non-overlapping capture groups matching a
1097 /// particular regular expression.
1098 ///
1099 /// The iterator stops when no more matches can be found.
1100 ///
1101 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
1102 /// lifetime of the matched string.
1103 pub struct CaptureMatches<'r, 't>(CaptureMatchesInner<'r, 't>);
1104
1105 enum CaptureMatchesInner<'r, 't> {
1106     Dynamic(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>),
1107     Plugin(re_trait::CaptureMatches<'t, Plugin>),
1108 }
1109
1110 impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
1111     type Item = Captures<'t>;
1112
1113     fn next(&mut self) -> Option<Captures<'t>> {
1114         match self.0 {
1115             CaptureMatchesInner::Dynamic(ref mut it) => {
1116                 let named = it.regex().capture_name_idx().clone();
1117                 it.next().map(|locs| Captures {
1118                     text: it.text(),
1119                     locs: locs,
1120                     named_groups: NamedGroups::Dynamic(named),
1121                 })
1122             }
1123             CaptureMatchesInner::Plugin(ref mut it) => {
1124                 it.next().map(|locs| Captures {
1125                     text: it.text(),
1126                     locs: locs,
1127                     named_groups: NamedGroups::Plugin(it.regex().groups),
1128                 })
1129             }
1130         }
1131     }
1132 }
1133
1134 /// An iterator over all non-overlapping matches for a particular string.
1135 ///
1136 /// The iterator yields a `Match` value. The iterator stops when no more
1137 /// matches can be found.
1138 ///
1139 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
1140 /// lifetime of the matched string.
1141 pub struct Matches<'r, 't>(MatchesInner<'r, 't>);
1142
1143 enum MatchesInner<'r, 't> {
1144     Dynamic(re_trait::Matches<'t, ExecNoSyncStr<'r>>),
1145     Plugin(re_trait::Matches<'t, Plugin>),
1146 }
1147
1148 impl<'r, 't> Matches<'r, 't> {
1149     fn text(&self) -> &'t str {
1150         match self.0 {
1151             MatchesInner::Dynamic(ref it) => it.text(),
1152             MatchesInner::Plugin(ref it) => it.text(),
1153         }
1154     }
1155 }
1156
1157 impl<'r, 't> Iterator for Matches<'r, 't> {
1158     type Item = Match<'t>;
1159
1160     fn next(&mut self) -> Option<Match<'t>> {
1161         let text = self.text();
1162         match self.0 {
1163             MatchesInner::Dynamic(ref mut it) => {
1164                 it.next().map(|(s, e)| Match::new(text, s, e))
1165             }
1166             MatchesInner::Plugin(ref mut it) => {
1167                 it.next().map(|(s, e)| Match::new(text, s, e))
1168             }
1169         }
1170     }
1171 }
1172
1173 /// Replacer describes types that can be used to replace matches in a string.
1174 ///
1175 /// In general, users of this crate shouldn't need to implement this trait,
1176 /// since implementations are already provided for `&str` and
1177 /// `FnMut(&Captures) -> String`, which covers most use cases.
1178 pub trait Replacer {
1179     /// Appends text to `dst` to replace the current match.
1180     ///
1181     /// The current match is represented by `caps`, which is guaranteed to
1182     /// have a match at capture group `0`.
1183     ///
1184     /// For example, a no-op replacement would be
1185     /// `dst.extend(caps.get(0).unwrap().as_str())`.
1186     fn replace_append(&mut self, caps: &Captures, dst: &mut String);
1187
1188     /// Return a fixed unchanging replacement string.
1189     ///
1190     /// When doing replacements, if access to `Captures` is not needed (e.g.,
1191     /// the replacement byte string does not need `$` expansion), then it can
1192     /// be beneficial to avoid finding sub-captures.
1193     ///
1194     /// In general, this is called once for every call to `replacen`.
1195     fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
1196         None
1197     }
1198 }
1199
1200 impl<'a> Replacer for &'a str {
1201     fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
1202         caps.expand(*self, dst);
1203     }
1204
1205     fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
1206         match memchr(b'$', self.as_bytes()) {
1207             Some(_) => None,
1208             None => Some(Cow::Borrowed(*self)),
1209         }
1210     }
1211 }
1212
1213 impl<F> Replacer for F where F: FnMut(&Captures) -> String {
1214     fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
1215         dst.push_str(&(*self)(caps));
1216     }
1217 }
1218
/// NoExpand marks a replacement string as literal text.
///
/// It can be used with `replace` and `replace_all` to do a literal string
/// replacement in which `$name` is *not* expanded to the corresponding
/// capture group. This can be both convenient (to avoid escaping `$`, for
/// example) and performant (since capture groups don't need to be found).
///
/// `'t` is the lifetime of the literal text.
pub struct NoExpand<'t>(pub &'t str);
1228
1229 impl<'t> Replacer for NoExpand<'t> {
1230     fn replace_append(&mut self, _: &Captures, dst: &mut String) {
1231         dst.push_str(self.0);
1232     }
1233
1234     fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
1235         Some(Cow::Borrowed(self.0))
1236     }
1237 }