vendor/regex/src/re_unicode.rs

   1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use std::borrow::Cow;
  12 use std::collections::HashMap;
  13 use std::fmt;
  14 use std::ops::Index;
  15 use std::str::FromStr;
  16 use std::sync::Arc;
  17
  18 use memchr::memchr;
  19 use syntax;
  20
  21 use error::Error;
  22 use exec::{Exec, ExecNoSyncStr};
  23 use expand::expand_str;
  24 use re_builder::unicode::RegexBuilder;
  25 use re_trait::{self, RegularExpression, SubCapturesPosIter};
  26
/// Escapes all regular expression meta characters in `text`.
///
/// The string returned may be safely used as a literal in a regular
/// expression.
///
/// This is a thin wrapper: the escaping itself is done by `syntax::escape`.
pub fn escape(text: &str) -> String {
    syntax::escape(text)
}
  34
  35 /// Match represents a single match of a regex in a haystack.
  36 ///
  37 /// The lifetime parameter `'t` refers to the lifetime of the matched text.
  38 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
  39 pub struct Match<'t> {
  40     text: &'t str,
  41     start: usize,
  42     end: usize,
  43 }
  44
  45 impl<'t> Match<'t> {
  46     /// Returns the starting byte offset of the match in the haystack.
  47     #[inline]
  48     pub fn start(&self) -> usize {
  49         self.start
  50     }
  51
  52     /// Returns the ending byte offset of the match in the haystack.
  53     #[inline]
  54     pub fn end(&self) -> usize {
  55         self.end
  56     }
  57
  58     /// Returns the matched text.
  59     #[inline]
  60     pub fn as_str(&self) -> &'t str {
  61         &self.text[self.start..self.end]
  62     }
  63
  64     /// Creates a new match from the given haystack and byte offsets.
  65     #[inline]
  66     fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
  67         Match {
  68             text: haystack,
  69             start: start,
  70             end: end,
  71         }
  72     }
  73 }
  74
impl<'t> From<Match<'t>> for &'t str {
    /// Converts a match into the text it matched (same as `Match::as_str`).
    fn from(m: Match<'t>) -> &'t str {
        m.as_str()
    }
}
  80
  81 /// A compiled regular expression for matching Unicode strings.
  82 ///
  83 /// It is represented as either a sequence of bytecode instructions (dynamic)
  84 /// or as a specialized Rust function (native). It can be used to search, split
  85 /// or replace text. All searching is done with an implicit `.*?` at the
  86 /// beginning and end of an expression. To force an expression to match the
  87 /// whole string (or a prefix or a suffix), you must use an anchor like `^` or
  88 /// `$` (or `\A` and `\z`).
  89 ///
  90 /// While this crate will handle Unicode strings (whether in the regular
  91 /// expression or in the search text), all positions returned are **byte
  92 /// indices**. Every byte index is guaranteed to be at a Unicode code point
  93 /// boundary.
  94 ///
  95 /// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a
  96 /// compiled regular expression and text to search, respectively.
  97 ///
  98 /// The only methods that allocate new strings are the string replacement
  99 /// methods. All other methods (searching and splitting) return borrowed
 100 /// pointers into the string given.
 101 ///
 102 /// # Examples
 103 ///
 104 /// Find the location of a US phone number:
 105 ///
 106 /// ```rust
 107 /// # use regex::Regex;
 108 /// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
 109 /// let mat = re.find("phone: 111-222-3333").unwrap();
 110 /// assert_eq!((mat.start(), mat.end()), (7, 19));
 111 /// ```
 112 ///
 113 /// # Using the `std::str::pattern` methods with `Regex`
 114 ///
 115 /// > **Note**: This section requires that this crate is compiled with the
 116 /// > `pattern` Cargo feature enabled, which **requires nightly Rust**.
 117 ///
 118 /// Since `Regex` implements `Pattern`, you can use regexes with methods
 119 /// defined on `&str`. For example, `is_match`, `find`, `find_iter`
 120 /// and `split` can be replaced with `str::contains`, `str::find`,
 121 /// `str::match_indices` and `str::split`.
 122 ///
 123 /// Here are some examples:
 124 ///
 125 /// ```rust,ignore
 126 /// # use regex::Regex;
 127 /// let re = Regex::new(r"\d+").unwrap();
 128 /// let haystack = "a111b222c";
 129 ///
 130 /// assert!(haystack.contains(&re));
 131 /// assert_eq!(haystack.find(&re), Some(1));
 132 /// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(),
 133 ///            vec![(1, 4), (5, 8)]);
 134 /// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]);
 135 /// ```
#[derive(Clone)]
// Newtype over the compiled program: the methods in this module reach the
// matching engine through the wrapped `Exec` (`self.0`).
pub struct Regex(Exec);
 138
 139 impl fmt::Display for Regex {
 140     /// Shows the original regular expression.
 141     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 142         write!(f, "{}", self.as_str())
 143     }
 144 }
 145
 146 impl fmt::Debug for Regex {
 147     /// Shows the original regular expression.
 148     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 149         fmt::Display::fmt(self, f)
 150     }
 151 }
 152
#[doc(hidden)]
impl From<Exec> for Regex {
    // Wraps an already-built `Exec` as-is; no further processing happens.
    fn from(exec: Exec) -> Regex {
        Regex(exec)
    }
}
 159
impl FromStr for Regex {
    type Err = Error;

    /// Attempts to parse a string into a regular expression.
    ///
    /// This simply calls `Regex::new(s)`, so an invalid pattern produces the
    /// same `Error` that `Regex::new` would return.
    fn from_str(s: &str) -> Result<Regex, Error> {
        Regex::new(s)
    }
}
 168
 169 /// Core regular expression methods.
 170 impl Regex {
 171     /// Compiles a regular expression. Once compiled, it can be used repeatedly
 172     /// to search, split or replace text in a string.
 173     ///
 174     /// If an invalid expression is given, then an error is returned.
    pub fn new(re: &str) -> Result<Regex, Error> {
        // No builder options are configured here, so the pattern is compiled
        // with whatever settings `RegexBuilder` starts out with.
        RegexBuilder::new(re).build()
    }
 178
 179     /// Returns true if and only if the regex matches the string given.
 180     ///
 181     /// It is recommended to use this method if all you need to do is test
 182     /// a match, since the underlying matching engine may be able to do less
 183     /// work.
 184     ///
 185     /// # Example
 186     ///
 187     /// Test if some text contains at least one word with exactly 13
 188     /// Unicode word characters:
 189     ///
 190     /// ```rust
 191     /// # extern crate regex; use regex::Regex;
 192     /// # fn main() {
 193     /// let text = "I categorically deny having triskaidekaphobia.";
 194     /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
 195     /// # }
 196     /// ```
    pub fn is_match(&self, text: &str) -> bool {
        // Offset 0: the search starts at the beginning of `text`.
        self.is_match_at(text, 0)
    }
 200
 201     /// Returns the start and end byte range of the leftmost-first match in
 202     /// `text`. If no match exists, then `None` is returned.
 203     ///
 204     /// Note that this should only be used if you want to discover the position
 205     /// of the match. Testing the existence of a match is faster if you use
 206     /// `is_match`.
 207     ///
 208     /// # Example
 209     ///
 210     /// Find the start and end location of the first word with exactly 13
 211     /// Unicode word characters:
 212     ///
 213     /// ```rust
 214     /// # extern crate regex; use regex::Regex;
 215     /// # fn main() {
 216     /// let text = "I categorically deny having triskaidekaphobia.";
 217     /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
 218     /// assert_eq!(mat.start(), 2);
 219     /// assert_eq!(mat.end(), 15);
 220     /// # }
 221     /// ```
    pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
        // Offset 0: the search starts at the beginning of `text`.
        self.find_at(text, 0)
    }
 225
 226     /// Returns an iterator for each successive non-overlapping match in
 227     /// `text`, returning the start and end byte indices with respect to
 228     /// `text`.
 229     ///
 230     /// # Example
 231     ///
 232     /// Find the start and end location of every word with exactly 13 Unicode
 233     /// word characters:
 234     ///
 235     /// ```rust
 236     /// # extern crate regex; use regex::Regex;
 237     /// # fn main() {
 238     /// let text = "Retroactively relinquishing remunerations is reprehensible.";
 239     /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
 240     ///     println!("{:?}", mat);
 241     /// }
 242     /// # }
 243     /// ```
    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
        // `Matches` wraps the searcher's own match iterator; the result
        // borrows both this regex (`'r`) and the text (`'t`).
        Matches(self.0.searcher_str().find_iter(text))
    }
 247
 248     /// Returns the capture groups corresponding to the leftmost-first
 249     /// match in `text`. Capture group `0` always corresponds to the entire
 250     /// match. If no match is found, then `None` is returned.
 251     ///
 252     /// You should only use `captures` if you need access to the location of
 253     /// capturing group matches. Otherwise, `find` is faster for discovering
 254     /// the location of the overall match.
 255     ///
 256     /// # Examples
 257     ///
 258     /// Say you have some text with movie names and their release years,
 259     /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
 260     /// looking like that, while also extracting the movie name and its release
 261     /// year separately.
 262     ///
 263     /// ```rust
 264     /// # extern crate regex; use regex::Regex;
 265     /// # fn main() {
 266     /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
 267     /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
 268     /// let caps = re.captures(text).unwrap();
 269     /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane");
 270     /// assert_eq!(caps.get(2).unwrap().as_str(), "1941");
 271     /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
 272     /// // You can also access the groups by index using the Index notation.
 273     /// // Note that this will panic on an invalid index.
 274     /// assert_eq!(&caps[1], "Citizen Kane");
 275     /// assert_eq!(&caps[2], "1941");
 276     /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
 277     /// # }
 278     /// ```
 279     ///
 280     /// Note that the full match is at capture group `0`. Each subsequent
 281     /// capture group is indexed by the order of its opening `(`.
 282     ///
 283     /// We can make this example a bit clearer by using *named* capture groups:
 284     ///
 285     /// ```rust
 286     /// # extern crate regex; use regex::Regex;
 287     /// # fn main() {
 288     /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
 289     ///                .unwrap();
 290     /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
 291     /// let caps = re.captures(text).unwrap();
 292     /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane");
 293     /// assert_eq!(caps.name("year").unwrap().as_str(), "1941");
 294     /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
 295     /// // You can also access the groups by name using the Index notation.
 296     /// // Note that this will panic on an invalid group name.
 297     /// assert_eq!(&caps["title"], "Citizen Kane");
 298     /// assert_eq!(&caps["year"], "1941");
 299     /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
 300     ///
 301     /// # }
 302     /// ```
 303     ///
 304     /// Here we name the capture groups, which we can access with the `name`
 305     /// method or the `Index` notation with a `&str`. Note that the named
 306     /// capture groups are still accessible with `get` or the `Index` notation
 307     /// with a `usize`.
 308     ///
 309     /// The `0`th capture group is always unnamed, so it must always be
 310     /// accessed with `get(0)` or `[0]`.
 311     pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
 312         let mut locs = self.capture_locations();
 313         self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
 314             text: text,
 315             locs: locs.0,
 316             named_groups: self.0.capture_name_idx().clone(),
 317         })
 318     }
 319
 320     /// Returns an iterator over all the non-overlapping capture groups matched
 321     /// in `text`. This is operationally the same as `find_iter`, except it
 322     /// yields information about capturing group matches.
 323     ///
 324     /// # Example
 325     ///
 326     /// We can use this to find all movie titles and their release years in
 327     /// some text, where the movie is formatted like "'Title' (xxxx)":
 328     ///
 329     /// ```rust
 330     /// # extern crate regex; use regex::Regex;
 331     /// # fn main() {
 332     /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
 333     ///                .unwrap();
 334     /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
 335     /// for caps in re.captures_iter(text) {
 336     ///     println!("Movie: {:?}, Released: {:?}",
 337     ///              &caps["title"], &caps["year"]);
 338     /// }
 339     /// // Output:
 340     /// // Movie: Citizen Kane, Released: 1941
 341     /// // Movie: The Wizard of Oz, Released: 1939
 342     /// // Movie: M, Released: 1931
 343     /// # }
 344     /// ```
    pub fn captures_iter<'r, 't>(
        &'r self,
        text: &'t str,
    ) -> CaptureMatches<'r, 't> {
        // `CaptureMatches` wraps the searcher's own captures iterator; the
        // result borrows both this regex (`'r`) and the text (`'t`).
        CaptureMatches(self.0.searcher_str().captures_iter(text))
    }
 351
 352     /// Returns an iterator of substrings of `text` delimited by a match of the
 353     /// regular expression. Namely, each element of the iterator corresponds to
 354     /// text that *isn't* matched by the regular expression.
 355     ///
 356     /// This method will *not* copy the text given.
 357     ///
 358     /// # Example
 359     ///
 360     /// To split a string delimited by arbitrary amounts of spaces or tabs:
 361     ///
 362     /// ```rust
 363     /// # extern crate regex; use regex::Regex;
 364     /// # fn main() {
 365     /// let re = Regex::new(r"[ \t]+").unwrap();
 366     /// let fields: Vec<&str> = re.split("a b \t  c\td    e").collect();
 367     /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
 368     /// # }
 369     /// ```
    pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> {
        Split {
            // Supplies the delimiter matches, in order.
            finder: self.find_iter(text),
            // Byte offset at which the next yielded substring begins.
            last: 0,
        }
    }
 376
 377     /// Returns an iterator of at most `limit` substrings of `text` delimited
 378     /// by a match of the regular expression. (A `limit` of `0` will return no
 379     /// substrings.) Namely, each element of the iterator corresponds to text
 380     /// that *isn't* matched by the regular expression. The remainder of the
 381     /// string that is not split will be the last element in the iterator.
 382     ///
 383     /// This method will *not* copy the text given.
 384     ///
 385     /// # Example
 386     ///
 387     /// Get the first two words in some text:
 388     ///
 389     /// ```rust
 390     /// # extern crate regex; use regex::Regex;
 391     /// # fn main() {
 392     /// let re = Regex::new(r"\W+").unwrap();
 393     /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
 394     /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
 395     /// # }
 396     /// ```
    pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize)
                         -> SplitN<'r, 't> {
        SplitN {
            // The unlimited splitter that does the actual work.
            splits: self.split(text),
            // Number of substrings that may still be yielded.
            n: limit,
        }
    }
 404
 405     /// Replaces the leftmost-first match with the replacement provided.
 406     /// The replacement can be a regular string (where `$N` and `$name` are
 407     /// expanded to match capture groups) or a function that takes the matches'
 408     /// `Captures` and returns the replaced string.
 409     ///
 410     /// If no match is found, then a copy of the string is returned unchanged.
 411     ///
 412     /// # Replacement string syntax
 413     ///
    /// All instances of `$name` in the replacement text are replaced with the
 415     /// corresponding capture group `name`.
 416     ///
 417     /// `name` may be an integer corresponding to the index of the
 418     /// capture group (counted by order of opening parenthesis where `0` is the
 419     /// entire match) or it can be a name (consisting of letters, digits or
 420     /// underscores) corresponding to a named capture group.
 421     ///
 422     /// If `name` isn't a valid capture group (whether the name doesn't exist
 423     /// or isn't a valid index), then it is replaced with the empty string.
 424     ///
 425     /// The longest possible name is used. e.g., `$1a` looks up the capture
 426     /// group named `1a` and not the capture group at index `1`. To exert more
 427     /// precise control over the name, use braces, e.g., `${1}a`.
 428     ///
 429     /// To write a literal `$` use `$$`.
 430     ///
 431     /// # Examples
 432     ///
 433     /// Note that this function is polymorphic with respect to the replacement.
 434     /// In typical usage, this can just be a normal string:
 435     ///
 436     /// ```rust
 437     /// # extern crate regex; use regex::Regex;
 438     /// # fn main() {
 439     /// let re = Regex::new("[^01]+").unwrap();
 440     /// assert_eq!(re.replace("1078910", ""), "1010");
 441     /// # }
 442     /// ```
 443     ///
 444     /// But anything satisfying the `Replacer` trait will work. For example,
 445     /// a closure of type `|&Captures| -> String` provides direct access to the
 446     /// captures corresponding to a match. This allows one to access
 447     /// capturing group matches easily:
 448     ///
 449     /// ```rust
 450     /// # extern crate regex; use regex::Regex;
 451     /// # use regex::Captures; fn main() {
 452     /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
 453     /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
 454     ///     format!("{} {}", &caps[2], &caps[1])
 455     /// });
 456     /// assert_eq!(result, "Bruce Springsteen");
 457     /// # }
 458     /// ```
 459     ///
 460     /// But this is a bit cumbersome to use all the time. Instead, a simple
 461     /// syntax is supported that expands `$name` into the corresponding capture
 462     /// group. Here's the last example, but using this expansion technique
 463     /// with named capture groups:
 464     ///
 465     /// ```rust
 466     /// # extern crate regex; use regex::Regex;
 467     /// # fn main() {
 468     /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
 469     /// let result = re.replace("Springsteen, Bruce", "$first $last");
 470     /// assert_eq!(result, "Bruce Springsteen");
 471     /// # }
 472     /// ```
 473     ///
 474     /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
 475     /// would produce the same result. To write a literal `$` use `$$`.
 476     ///
 477     /// Sometimes the replacement string requires use of curly braces to
 478     /// delineate a capture group replacement and surrounding literal text.
 479     /// For example, if we wanted to join two words together with an
 480     /// underscore:
 481     ///
 482     /// ```rust
 483     /// # extern crate regex; use regex::Regex;
 484     /// # fn main() {
 485     /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
 486     /// let result = re.replace("deep fried", "${first}_$second");
 487     /// assert_eq!(result, "deep_fried");
 488     /// # }
 489     /// ```
 490     ///
 491     /// Without the curly braces, the capture group name `first_` would be
 492     /// used, and since it doesn't exist, it would be replaced with the empty
 493     /// string.
 494     ///
 495     /// Finally, sometimes you just want to replace a literal string with no
    /// regard for capturing group expansion. This can be done by wrapping a
    /// string with `NoExpand`:
 498     ///
 499     /// ```rust
 500     /// # extern crate regex; use regex::Regex;
 501     /// # fn main() {
 502     /// use regex::NoExpand;
 503     ///
 504     /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
 505     /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
 506     /// assert_eq!(result, "$2 $last");
 507     /// # }
 508     /// ```
    pub fn replace<'t, R: Replacer>(
        &self,
        text: &'t str,
        rep: R,
    ) -> Cow<'t, str> {
        // A limit of 1 replaces only the first match.
        self.replacen(text, 1, rep)
    }
 516
 517     /// Replaces all non-overlapping matches in `text` with the replacement
 518     /// provided. This is the same as calling `replacen` with `limit` set to
 519     /// `0`.
 520     ///
 521     /// See the documentation for `replace` for details on how to access
 522     /// capturing group matches in the replacement string.
    pub fn replace_all<'t, R: Replacer>(
        &self,
        text: &'t str,
        rep: R,
    ) -> Cow<'t, str> {
        // `replacen` treats a limit of 0 as "no limit".
        self.replacen(text, 0, rep)
    }
 530
 531     /// Replaces at most `limit` non-overlapping matches in `text` with the
 532     /// replacement provided. If `limit` is 0, then all non-overlapping matches
 533     /// are replaced.
 534     ///
 535     /// See the documentation for `replace` for details on how to access
 536     /// capturing group matches in the replacement string.
 537     pub fn replacen<'t, R: Replacer>(
 538         &self,
 539         text: &'t str,
 540         limit: usize,
 541         mut rep: R,
 542     ) -> Cow<'t, str> {
 543         // If we know that the replacement doesn't have any capture expansions,
 544         // then we can fast path. The fast path can make a tremendous
 545         // difference:
 546         //
 547         //   1) We use `find_iter` instead of `captures_iter`. Not asking for
 548         //      captures generally makes the regex engines faster.
 549         //   2) We don't need to look up all of the capture groups and do
 550         //      replacements inside the replacement string. We just push it
 551         //      at each match and be done with it.
 552         if let Some(rep) = rep.no_expansion() {
 553             let mut it = self.find_iter(text).enumerate().peekable();
 554             if it.peek().is_none() {
 555                 return Cow::Borrowed(text);
 556             }
 557             let mut new = String::with_capacity(text.len());
 558             let mut last_match = 0;
 559             for (i, m) in it {
 560                 if limit > 0 && i >= limit {
 561                     break
 562                 }
 563                 new.push_str(&text[last_match..m.start()]);
 564                 new.push_str(&rep);
 565                 last_match = m.end();
 566             }
 567             new.push_str(&text[last_match..]);
 568             return Cow::Owned(new);
 569         }
 570
 571         // The slower path, which we use if the replacement needs access to
 572         // capture groups.
 573         let mut it = self.captures_iter(text).enumerate().peekable();
 574         if it.peek().is_none() {
 575             return Cow::Borrowed(text);
 576         }
 577         let mut new = String::with_capacity(text.len());
 578         let mut last_match = 0;
 579         for (i, cap) in it {
 580             if limit > 0 && i >= limit {
 581                 break
 582             }
 583             // unwrap on 0 is OK because captures only reports matches
 584             let m = cap.get(0).unwrap();
 585             new.push_str(&text[last_match..m.start()]);
 586             rep.replace_append(&cap, &mut new);
 587             last_match = m.end();
 588         }
 589         new.push_str(&text[last_match..]);
 590         Cow::Owned(new)
 591     }
 592 }
 593
 594 /// Advanced or "lower level" search methods.
 595 impl Regex {
 596     /// Returns the end location of a match in the text given.
 597     ///
 598     /// This method may have the same performance characteristics as
 599     /// `is_match`, except it provides an end location for a match. In
 600     /// particular, the location returned *may be shorter* than the proper end
 601     /// of the leftmost-first match.
 602     ///
 603     /// # Example
 604     ///
 605     /// Typically, `a+` would match the entire first sequence of `a` in some
 606     /// text, but `shortest_match` can give up as soon as it sees the first
 607     /// `a`.
 608     ///
 609     /// ```rust
 610     /// # extern crate regex; use regex::Regex;
 611     /// # fn main() {
 612     /// let text = "aaaaa";
 613     /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
 614     /// assert_eq!(pos, Some(1));
 615     /// # }
 616     /// ```
    pub fn shortest_match(&self, text: &str) -> Option<usize> {
        // Offset 0: the search starts at the beginning of `text`.
        self.shortest_match_at(text, 0)
    }
 620
 621     /// Returns the same as shortest_match, but starts the search at the given
 622     /// offset.
 623     ///
 624     /// The significance of the starting point is that it takes the surrounding
 625     /// context into consideration. For example, the `\A` anchor can only
 626     /// match when `start == 0`.
    pub fn shortest_match_at(
        &self,
        text: &str,
        start: usize,
    ) -> Option<usize> {
        // Forwards unchanged to the searcher obtained from `searcher_str()`.
        self.0.searcher_str().shortest_match_at(text, start)
    }
 634
 635     /// Returns the same as is_match, but starts the search at the given
 636     /// offset.
 637     ///
 638     /// The significance of the starting point is that it takes the surrounding
 639     /// context into consideration. For example, the `\A` anchor can only
 640     /// match when `start == 0`.
    pub fn is_match_at(&self, text: &str, start: usize) -> bool {
        // Any reported end offset means a match exists; the offset itself is
        // not needed.
        self.shortest_match_at(text, start).is_some()
    }
 644
 645     /// Returns the same as find, but starts the search at the given
 646     /// offset.
 647     ///
 648     /// The significance of the starting point is that it takes the surrounding
 649     /// context into consideration. For example, the `\A` anchor can only
 650     /// match when `start == 0`.
 651     pub fn find_at<'t>(
 652         &self,
 653         text: &'t str,
 654         start: usize,
 655     ) -> Option<Match<'t>> {
 656         self.0.searcher_str().find_at(text, start).map(|(s, e)| {
 657             Match::new(text, s, e)
 658         })
 659     }
 660
 661     /// This is like `captures`, but uses
 662     /// [`CaptureLocations`](struct.CaptureLocations.html)
 663     /// instead of
 664     /// [`Captures`](struct.Captures.html) in order to amortize allocations.
 665     ///
 666     /// To create a `CaptureLocations` value, use the
 667     /// `Regex::capture_locations` method.
 668     ///
    /// This returns the overall match if this was successful, which is always
    /// equivalent to the `0`th capture group.
    pub fn captures_read<'t>(
        &self,
        locs: &mut CaptureLocations,
        text: &'t str,
    ) -> Option<Match<'t>> {
        // Offset 0: the search starts at the beginning of `text`.
        self.captures_read_at(locs, text, 0)
    }
 678
 679     /// Returns the same as captures, but starts the search at the given
 680     /// offset and populates the capture locations given.
 681     ///
 682     /// The significance of the starting point is that it takes the surrounding
 683     /// context into consideration. For example, the `\A` anchor can only
 684     /// match when `start == 0`.
 685     pub fn captures_read_at<'t>(
 686         &self,
 687         locs: &mut CaptureLocations,
 688         text: &'t str,
 689         start: usize,
 690     ) -> Option<Match<'t>> {
 691         self.0
 692             .searcher_str()
 693             .captures_read_at(&mut locs.0, text, start)
 694             .map(|(s, e)| Match::new(text, s, e))
 695     }
 696
 697     /// An undocumented alias for `captures_read_at`.
 698     ///
 699     /// The `regex-capi` crate previously used this routine, so to avoid
 700     /// breaking that crate, we continue to provide the name as an undocumented
 701     /// alias.
    #[doc(hidden)]
    pub fn read_captures_at<'t>(
        &self,
        locs: &mut CaptureLocations,
        text: &'t str,
        start: usize,
    ) -> Option<Match<'t>> {
        // Pure forwarding; this exists only so the old name keeps working.
        self.captures_read_at(locs, text, start)
    }
 711 }
 712
 713 /// Auxiliary methods.
 714 impl Regex {
 715     /// Returns the original string of this regex.
    pub fn as_str(&self) -> &str {
        // NOTE(review): this takes the first entry of `regex_strings()`;
        // presumably a `Regex` is always built from exactly one pattern, so
        // the index cannot be out of bounds -- confirm in `exec`.
        &self.0.regex_strings()[0]
    }
 719
 720     /// Returns an iterator over the capture names.
    pub fn capture_names(&self) -> CaptureNames {
        // Borrowing iterator over the stored list of optional group names.
        CaptureNames(self.0.capture_names().iter())
    }
 724
    /// Returns the number of captures.
    ///
    /// This is the length of the same list that `capture_names` iterates
    /// over, which, per the `CaptureNames` documentation, includes the
    /// unnamed entry for capture `0` (the whole match).
    pub fn captures_len(&self) -> usize {
        self.0.capture_names().len()
    }
 729
 730     /// Returns an empty set of capture locations that can be reused in
 731     /// multiple calls to `captures_read` or `captures_read_at`.
    pub fn capture_locations(&self) -> CaptureLocations {
        // The storage is created by the searcher and wrapped in the public
        // newtype.
        CaptureLocations(self.0.searcher_str().locations())
    }
 735
 736     /// An alias for `capture_locations` to preserve backward compatibility.
 737     ///
 738     /// The `regex-capi` crate uses this method, so to avoid breaking that
 739     /// crate, we continue to export it as an undocumented API.
 740     #[doc(hidden)]
 741     pub fn locations(&self) -> CaptureLocations {
 742         CaptureLocations(self.0.searcher_str().locations())
 743     }
 744 }
 745
 746 /// An iterator over the names of all possible captures.
 747 ///
 748 /// `None` indicates an unnamed capture; the first element (capture 0, the
 749 /// whole matched region) is always unnamed.
 750 ///
 751 /// `'r` is the lifetime of the compiled regular expression.
 752 pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
 753
 754 impl<'r> Iterator for CaptureNames<'r> {
 755     type Item = Option<&'r str>;
 756
 757     fn next(&mut self) -> Option<Option<&'r str>> {
 758         self.0
 759             .next()
 760             .as_ref()
 761             .map(|slot| slot.as_ref().map(|name| name.as_ref()))
 762     }
 763
 764     fn size_hint(&self) -> (usize, Option<usize>) {
 765         self.0.size_hint()
 766     }
 767 }
 768
 769 /// Yields all substrings delimited by a regular expression match.
 770 ///
 771 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 772 /// lifetime of the string being split.
 773 pub struct Split<'r, 't> {
 774     finder: Matches<'r, 't>,
 775     last: usize,
 776 }
 777
 778 impl<'r, 't> Iterator for Split<'r, 't> {
 779     type Item = &'t str;
 780
 781     fn next(&mut self) -> Option<&'t str> {
 782         let text = self.finder.0.text();
 783         match self.finder.next() {
 784             None => {
 785                 if self.last >= text.len() {
 786                     None
 787                 } else {
 788                     let s = &text[self.last..];
 789                     self.last = text.len();
 790                     Some(s)
 791                 }
 792             }
 793             Some(m) => {
 794                 let matched = &text[self.last..m.start()];
 795                 self.last = m.end();
 796                 Some(matched)
 797             }
 798         }
 799     }
 800 }
 801
 802 /// Yields at most `N` substrings delimited by a regular expression match.
 803 ///
 804 /// The last substring will be whatever remains after splitting.
 805 ///
 806 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 807 /// lifetime of the string being split.
 808 pub struct SplitN<'r, 't> {
 809     splits: Split<'r, 't>,
 810     n: usize,
 811 }
 812
 813 impl<'r, 't> Iterator for SplitN<'r, 't> {
 814     type Item = &'t str;
 815
 816     fn next(&mut self) -> Option<&'t str> {
 817         if self.n == 0 {
 818             return None
 819         }
 820         self.n -= 1;
 821         if self.n == 0 {
 822             let text = self.splits.finder.0.text();
 823             Some(&text[self.splits.last..])
 824         } else {
 825             self.splits.next()
 826         }
 827     }
 828 }
 829
/// CaptureLocations is a low level representation of the raw offsets of each
/// submatch.
///
/// You can think of this as a lower level
/// [`Captures`](struct.Captures.html), where this type does not support
/// named capturing groups directly and it does not borrow the text that these
/// offsets were matched on.
///
/// Primarily, this type is useful when using the lower level `Regex` APIs
/// such as `read_captures`, which permits amortizing the allocation in which
/// capture match locations are stored.
///
/// In order to build a value of this type, you'll need to call the
/// `capture_locations` method on the `Regex` being used to execute the search.
/// The value returned can then be reused in subsequent searches.
//
// NOTE(review): the `Regex` method referred to above as `read_captures` is
// presumably named `captures_read` -- confirm against the `Regex` impl.
//
// This is a newtype over `re_trait::Locations`. The wrapped value is private,
// so offsets are only reachable through the inherent methods of this type.
#[derive(Clone, Debug)]
pub struct CaptureLocations(re_trait::Locations);
 847
/// A type alias for `CaptureLocations` for backwards compatibility.
///
/// Previously, we exported `CaptureLocations` as `Locations` in an
/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
/// we continue re-exporting the same undocumented API.
///
/// Because this is only an alias, `Locations` and `CaptureLocations` name the
/// same type and share every method.
#[doc(hidden)]
pub type Locations = CaptureLocations;
 855
 856 impl CaptureLocations {
 857     /// Returns the start and end positions of the Nth capture group. Returns
 858     /// `None` if `i` is not a valid capture group or if the capture group did
 859     /// not match anything. The positions returned are *always* byte indices
 860     /// with respect to the original string matched.
 861     #[inline]
 862     pub fn get(&self, i: usize) -> Option<(usize, usize)> {
 863         self.0.pos(i)
 864     }
 865
 866     /// Returns the total number of capturing groups.
 867     ///
 868     /// This is always at least `1` since every regex has at least `1`
 869     /// capturing group that corresponds to the entire match.
 870     #[inline]
 871     pub fn len(&self) -> usize {
 872         self.0.len()
 873     }
 874
 875     /// An alias for the `get` method for backwards compatibility.
 876     ///
 877     /// Previously, we exported `get` as `pos` in an undocumented API. To
 878     /// prevent breaking that code (e.g., in `regex-capi`), we continue
 879     /// re-exporting the same undocumented API.
 880     #[doc(hidden)]
 881     #[inline]
 882     pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
 883         self.get(i)
 884     }
 885 }
 886
/// Captures represents a group of captured strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
/// index corresponds to the next capture group in the regex. If a capture
/// group is named, then the matched string is *also* available via the `name`
/// method. (Note that the 0th capture is always unnamed and so must be
/// accessed with the `get` method.)
///
/// Positions returned from a capture group are always byte indices.
///
/// `'t` is the lifetime of the matched text.
pub struct Captures<'t> {
    // The text that the offsets stored in `locs` index into.
    text: &'t str,
    // Offsets of every capture group, looked up by group index.
    locs: re_trait::Locations,
    // Maps a capture group name to its group index; held behind an `Arc`.
    named_groups: Arc<HashMap<String, usize>>,
}
 903
 904 impl<'t> Captures<'t> {
 905     /// Returns the match associated with the capture group at index `i`. If
 906     /// `i` does not correspond to a capture group, or if the capture group
 907     /// did not participate in the match, then `None` is returned.
 908     ///
 909     /// # Examples
 910     ///
 911     /// Get the text of the match with a default of an empty string if this
 912     /// group didn't participate in the match:
 913     ///
 914     /// ```rust
 915     /// # use regex::Regex;
 916     /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
 917     /// let caps = re.captures("abc123").unwrap();
 918     ///
 919     /// let text1 = caps.get(1).map_or("", |m| m.as_str());
 920     /// let text2 = caps.get(2).map_or("", |m| m.as_str());
 921     /// assert_eq!(text1, "123");
 922     /// assert_eq!(text2, "");
 923     /// ```
 924     pub fn get(&self, i: usize) -> Option<Match<'t>> {
 925         self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e))
 926     }
 927
 928     /// Returns the match for the capture group named `name`. If `name` isn't a
 929     /// valid capture group or didn't match anything, then `None` is returned.
 930     pub fn name(&self, name: &str) -> Option<Match<'t>> {
 931         self.named_groups.get(name).and_then(|&i| self.get(i))
 932     }
 933
 934     /// An iterator that yields all capturing matches in the order in which
 935     /// they appear in the regex. If a particular capture group didn't
 936     /// participate in the match, then `None` is yielded for that capture.
 937     ///
 938     /// The first match always corresponds to the overall match of the regex.
 939     pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
 940         SubCaptureMatches {
 941             caps: self,
 942             it: self.locs.iter(),
 943         }
 944     }
 945
 946     /// Expands all instances of `$name` in `replacement` to the corresponding
 947     /// capture group `name`, and writes them to the `dst` buffer given.
 948     ///
 949     /// `name` may be an integer corresponding to the index of the
 950     /// capture group (counted by order of opening parenthesis where `0` is the
 951     /// entire match) or it can be a name (consisting of letters, digits or
 952     /// underscores) corresponding to a named capture group.
 953     ///
 954     /// If `name` isn't a valid capture group (whether the name doesn't exist
 955     /// or isn't a valid index), then it is replaced with the empty string.
 956     ///
 957     /// The longest possible name is used. e.g., `$1a` looks up the capture
 958     /// group named `1a` and not the capture group at index `1`. To exert more
 959     /// precise control over the name, use braces, e.g., `${1}a`.
 960     ///
 961     /// To write a literal `$` use `$$`.
 962     pub fn expand(&self, replacement: &str, dst: &mut String) {
 963         expand_str(self, replacement, dst)
 964     }
 965
 966     /// Returns the number of captured groups.
 967     ///
 968     /// This is always at least `1`, since every regex has at least one capture
 969     /// group that corresponds to the full match.
 970     #[inline]
 971     pub fn len(&self) -> usize {
 972         self.locs.len()
 973     }
 974 }
 975
 976 impl<'t> fmt::Debug for Captures<'t> {
 977     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 978         f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
 979     }
 980 }
 981
 982 struct CapturesDebug<'c, 't: 'c>(&'c Captures<'t>);
 983
 984 impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
 985     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 986         // We'd like to show something nice here, even if it means an
 987         // allocation to build a reverse index.
 988         let slot_to_name: HashMap<&usize, &String> =
 989             self.0.named_groups.iter().map(|(a, b)| (b, a)).collect();
 990         let mut map = f.debug_map();
 991         for (slot, m) in self.0.locs.iter().enumerate() {
 992             let m = m.map(|(s, e)| &self.0.text[s..e]);
 993             if let Some(name) = slot_to_name.get(&slot) {
 994                 map.entry(&name, &m);
 995             } else {
 996                 map.entry(&slot, &m);
 997             }
 998         }
 999         map.finish()
1000     }
1001 }
1002
1003 /// Get a group by index.
1004 ///
1005 /// `'t` is the lifetime of the matched text.
1006 ///
1007 /// The text can't outlive the `Captures` object if this method is
1008 /// used, because of how `Index` is defined (normally `a[i]` is part
1009 /// of `a` and can't outlive it); to do that, use `get()` instead.
1010 ///
1011 /// # Panics
1012 ///
1013 /// If there is no group at the given index.
1014 impl<'t> Index<usize> for Captures<'t> {
1015     type Output = str;
1016
1017     fn index(&self, i: usize) -> &str {
1018         self.get(i).map(|m| m.as_str())
1019             .unwrap_or_else(|| panic!("no group at index '{}'", i))
1020     }
1021 }
1022
1023 /// Get a group by name.
1024 ///
1025 /// `'t` is the lifetime of the matched text and `'i` is the lifetime
1026 /// of the group name (the index).
1027 ///
1028 /// The text can't outlive the `Captures` object if this method is
1029 /// used, because of how `Index` is defined (normally `a[i]` is part
1030 /// of `a` and can't outlive it); to do that, use `name` instead.
1031 ///
1032 /// # Panics
1033 ///
1034 /// If there is no group named by the given value.
1035 impl<'t, 'i> Index<&'i str> for Captures<'t> {
1036     type Output = str;
1037
1038     fn index<'a>(&'a self, name: &'i str) -> &'a str {
1039         self.name(name).map(|m| m.as_str())
1040             .unwrap_or_else(|| panic!("no group named '{}'", name))
1041     }
1042 }
1043
1044 /// An iterator that yields all capturing matches in the order in which they
1045 /// appear in the regex.
1046 ///
1047 /// If a particular capture group didn't participate in the match, then `None`
1048 /// is yielded for that capture. The first match always corresponds to the
1049 /// overall match of the regex.
1050 ///
1051 /// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
1052 /// the lifetime `'t` corresponds to the originally matched text.
1053 pub struct SubCaptureMatches<'c, 't: 'c> {
1054     caps: &'c Captures<'t>,
1055     it: SubCapturesPosIter<'c>,
1056 }
1057
1058 impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
1059     type Item = Option<Match<'t>>;
1060
1061     fn next(&mut self) -> Option<Option<Match<'t>>> {
1062         self.it.next()
1063             .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
1064     }
1065 }
1066
1067 /// An iterator that yields all non-overlapping capture groups matching a
1068 /// particular regular expression.
1069 ///
1070 /// The iterator stops when no more matches can be found.
1071 ///
1072 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
1073 /// lifetime of the matched string.
1074 pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>);
1075
1076 impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
1077     type Item = Captures<'t>;
1078
1079     fn next(&mut self) -> Option<Captures<'t>> {
1080         self.0.next().map(|locs| Captures {
1081             text: self.0.text(),
1082             locs: locs,
1083             named_groups: self.0.regex().capture_name_idx().clone(),
1084         })
1085     }
1086 }
1087
1088 /// An iterator over all non-overlapping matches for a particular string.
1089 ///
1090 /// The iterator yields a `Match` value. The iterator stops when no more
1091 /// matches can be found.
1092 ///
1093 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
1094 /// lifetime of the matched string.
1095 pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>);
1096
1097 impl<'r, 't> Iterator for Matches<'r, 't> {
1098     type Item = Match<'t>;
1099
1100     fn next(&mut self) -> Option<Match<'t>> {
1101         let text = self.0.text();
1102         self.0.next().map(|(s, e)| Match::new(text, s, e))
1103     }
1104 }
1105
/// Replacer describes types that can be used to replace matches in a string.
///
/// In general, users of this crate shouldn't need to implement this trait,
/// since implementations are already provided for `&str` and
/// `FnMut(&Captures) -> String` (or any `FnMut(&Captures) -> T`
/// where `T: AsRef<str>`), which covers most use cases.
pub trait Replacer {
    /// Appends text to `dst` to replace the current match.
    ///
    /// The current match is represented by `caps`, which is guaranteed to
    /// have a match at capture group `0`.
    ///
    /// For example, a no-op replacement would be
    /// `dst.push_str(caps.get(0).unwrap().as_str())`.
    fn replace_append(&mut self, caps: &Captures, dst: &mut String);

    /// Return a fixed unchanging replacement string.
    ///
    /// When doing replacements, if access to `Captures` is not needed (e.g.,
    /// the replacement string does not need `$` expansion), then it can
    /// be beneficial to avoid finding sub-captures.
    ///
    /// In general, this is called once for every call to `replacen`.
    ///
    /// The default implementation returns `None`.
    fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
        None
    }

    /// Return a `Replacer` that borrows and wraps this `Replacer`.
    ///
    /// This is useful when you want to take a generic `Replacer` (which might
    /// not be cloneable) and use it without consuming it, so it can be used
    /// more than once.
    ///
    /// # Example
    ///
    /// ```
    /// use regex::{Regex, Replacer};
    ///
    /// fn replace_all_twice<R: Replacer>(
    ///     re: Regex,
    ///     src: &str,
    ///     mut rep: R,
    /// ) -> String {
    ///     let dst = re.replace_all(src, rep.by_ref());
    ///     let dst = re.replace_all(&dst, rep.by_ref());
    ///     dst.into_owned()
    /// }
    /// ```
    fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> {
        ReplacerRef(self)
    }
}
1158
1159 /// By-reference adaptor for a `Replacer`
1160 ///
1161 /// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
1162 #[derive(Debug)]
1163 pub struct ReplacerRef<'a, R: ?Sized + 'a>(&'a mut R);
1164
1165 impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
1166     fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
1167         self.0.replace_append(caps, dst)
1168     }
1169     fn no_expansion(&mut self) -> Option<Cow<str>> {
1170         self.0.no_expansion()
1171     }
1172 }
1173
1174 impl<'a> Replacer for &'a str {
1175     fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
1176         caps.expand(*self, dst);
1177     }
1178
1179     fn no_expansion(&mut self) -> Option<Cow<str>> {
1180         match memchr(b'$', self.as_bytes()) {
1181             Some(_) => None,
1182             None => Some(Cow::Borrowed(*self)),
1183         }
1184     }
1185 }
1186
1187 impl<F, T> Replacer for F where F: FnMut(&Captures) -> T, T: AsRef<str> {
1188     fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
1189         dst.push_str((*self)(caps).as_ref());
1190     }
1191 }
1192
1193 /// `NoExpand` indicates literal string replacement.
1194 ///
1195 /// It can be used with `replace` and `replace_all` to do a literal string
1196 /// replacement without expanding `$name` to their corresponding capture
1197 /// groups. This can be both convenient (to avoid escaping `$`, for example)
1198 /// and performant (since capture groups don't need to be found).
1199 ///
1200 /// `'t` is the lifetime of the literal text.
1201 pub struct NoExpand<'t>(pub &'t str);
1202
1203 impl<'t> Replacer for NoExpand<'t> {
1204     fn replace_append(&mut self, _: &Captures, dst: &mut String) {
1205         dst.push_str(self.0);
1206     }
1207
1208     fn no_expansion(&mut self) -> Option<Cow<str>> {
1209         Some(Cow::Borrowed(self.0))
1210     }
1211 }