src/vendor/regex/src/re_unicode.rs

   1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use std::borrow::Cow;
  12 use std::collections::HashMap;
  13 use std::fmt;
  14 use std::ops::Index;
  15 use std::str::FromStr;
  16 use std::sync::Arc;
  17
  18 use memchr::memchr;
  19 use syntax;
  20
  21 use error::Error;
  22 use exec::{Exec, ExecNoSyncStr};
  23 use expand::expand_str;
  24 use re_builder::unicode::RegexBuilder;
  25 use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter};
  26
/// Escapes all regular expression meta characters in `text`.
///
/// The string returned may be safely used as a literal in a regular
/// expression.
///
/// This is a thin wrapper: `text` is forwarded unchanged to
/// `syntax::escape` and its result is returned as-is.
pub fn escape(text: &str) -> String {
    syntax::escape(text)
}
  34
  35 /// Match represents a single match of a regex in a haystack.
  36 ///
  37 /// The lifetime parameter `'t` refers to the lifetime of the matched text.
  38 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
  39 pub struct Match<'t> {
  40     text: &'t str,
  41     start: usize,
  42     end: usize,
  43 }
  44
  45 impl<'t> Match<'t> {
  46     /// Returns the starting byte offset of the match in the haystack.
  47     #[inline]
  48     pub fn start(&self) -> usize {
  49         self.start
  50     }
  51
  52     /// Returns the ending byte offset of the match in the haystack.
  53     #[inline]
  54     pub fn end(&self) -> usize {
  55         self.end
  56     }
  57
  58     /// Returns the matched text.
  59     #[inline]
  60     pub fn as_str(&self) -> &'t str {
  61         &self.text[self.start..self.end]
  62     }
  63
  64     /// Creates a new match from the given haystack and byte offsets.
  65     #[inline]
  66     fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
  67         Match {
  68             text: haystack,
  69             start: start,
  70             end: end,
  71         }
  72     }
  73 }
  74
  75 impl<'t> From<Match<'t>> for &'t str {
  76     fn from(m: Match<'t>) -> &'t str {
  77         m.as_str()
  78     }
  79 }
  80
  81 /// A compiled regular expression for matching Unicode strings.
  82 ///
  83 /// It is represented as either a sequence of bytecode instructions (dynamic)
  84 /// or as a specialized Rust function (native). It can be used to search, split
  85 /// or replace text. All searching is done with an implicit `.*?` at the
  86 /// beginning and end of an expression. To force an expression to match the
  87 /// whole string (or a prefix or a suffix), you must use an anchor like `^` or
  88 /// `$` (or `\A` and `\z`).
  89 ///
  90 /// While this crate will handle Unicode strings (whether in the regular
  91 /// expression or in the search text), all positions returned are **byte
  92 /// indices**. Every byte index is guaranteed to be at a Unicode code point
  93 /// boundary.
  94 ///
  95 /// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a
  96 /// compiled regular expression and text to search, respectively.
  97 ///
  98 /// The only methods that allocate new strings are the string replacement
  99 /// methods. All other methods (searching and splitting) return borrowed
 100 /// pointers into the string given.
 101 ///
 102 /// # Examples
 103 ///
 104 /// Find the location of a US phone number:
 105 ///
 106 /// ```rust
 107 /// # use regex::Regex;
 108 /// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap();
 109 /// let mat = re.find("phone: 111-222-3333").unwrap();
 110 /// assert_eq!((mat.start(), mat.end()), (7, 19));
 111 /// ```
 112 ///
 113 /// # Using the `std::str::pattern` methods with `Regex`
 114 ///
 115 /// > **Note**: This section requires that this crate is compiled with the
 116 /// > `pattern` Cargo feature enabled, which **requires nightly Rust**.
 117 ///
 118 /// Since `Regex` implements `Pattern`, you can use regexes with methods
 119 /// defined on `&str`. For example, `is_match`, `find`, `find_iter`
 120 /// and `split` can be replaced with `str::contains`, `str::find`,
 121 /// `str::match_indices` and `str::split`.
 122 ///
 123 /// Here are some examples:
 124 ///
 125 /// ```rust,ignore
 126 /// # use regex::Regex;
 127 /// let re = Regex::new(r"\d+").unwrap();
 128 /// let haystack = "a111b222c";
 129 ///
 130 /// assert!(haystack.contains(&re));
 131 /// assert_eq!(haystack.find(&re), Some(1));
 132 /// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(),
 133 ///            vec![(1, 4), (5, 8)]);
 134 /// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]);
 135 /// ```
#[derive(Clone)]
// Newtype whose only field is the `Exec` that this type's methods reach
// through `self.0`.
pub struct Regex(Exec);
 138
 139 impl fmt::Display for Regex {
 140     /// Shows the original regular expression.
 141     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 142         write!(f, "{}", self.as_str())
 143     }
 144 }
 145
impl fmt::Debug for Regex {
    /// Shows the original regular expression.
    ///
    /// This forwards to the `Display` implementation, so both produce the
    /// same output.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Display::fmt(self, f)
    }
}
 152
#[doc(hidden)]
impl From<Exec> for Regex {
    /// Wraps an existing `Exec` in a `Regex`; nothing else is done.
    fn from(exec: Exec) -> Regex {
        Regex(exec)
    }
}
 159
impl FromStr for Regex {
    type Err = Error;

    /// Attempts to parse a string into a regular expression.
    ///
    /// This is the same as calling `Regex::new(s)`, including the error
    /// returned.
    fn from_str(s: &str) -> Result<Regex, Error> {
        Regex::new(s)
    }
}
 168
 169 /// Core regular expression methods.
 170 impl Regex {
 171     /// Compiles a regular expression. Once compiled, it can be used repeatedly
 172     /// to search, split or replace text in a string.
 173     ///
 174     /// If an invalid expression is given, then an error is returned.
    pub fn new(re: &str) -> Result<Regex, Error> {
        // Build straight from `RegexBuilder::new`; no builder options are
        // changed between construction and `build`.
        RegexBuilder::new(re).build()
    }
 178
 179     /// Returns true if and only if the regex matches the string given.
 180     ///
 181     /// It is recommended to use this method if all you need to do is test
 182     /// a match, since the underlying matching engine may be able to do less
 183     /// work.
 184     ///
 185     /// # Example
 186     ///
 187     /// Test if some text contains at least one word with exactly 13
 188     /// Unicode word characters:
 189     ///
 190     /// ```rust
 191     /// # extern crate regex; use regex::Regex;
 192     /// # fn main() {
 193     /// let text = "I categorically deny having triskaidekaphobia.";
 194     /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
 195     /// # }
 196     /// ```
    pub fn is_match(&self, text: &str) -> bool {
        // Search the whole haystack, i.e. start at byte offset 0.
        self.is_match_at(text, 0)
    }
 200
 201     /// Returns the start and end byte range of the leftmost-first match in
 202     /// `text`. If no match exists, then `None` is returned.
 203     ///
 204     /// Note that this should only be used if you want to discover the position
 205     /// of the match. Testing the existence of a match is faster if you use
 206     /// `is_match`.
 207     ///
 208     /// # Example
 209     ///
 210     /// Find the start and end location of the first word with exactly 13
 211     /// Unicode word characters:
 212     ///
 213     /// ```rust
 214     /// # extern crate regex; use regex::Regex;
 215     /// # fn main() {
 216     /// let text = "I categorically deny having triskaidekaphobia.";
 217     /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
 218     /// assert_eq!(mat.start(), 2);
 219     /// assert_eq!(mat.end(), 15);
 220     /// # }
 221     /// ```
    pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
        // Search the whole haystack, i.e. start at byte offset 0.
        self.find_at(text, 0)
    }
 225
 226     /// Returns an iterator for each successive non-overlapping match in
 227     /// `text`, returning the start and end byte indices with respect to
 228     /// `text`.
 229     ///
 230     /// # Example
 231     ///
 232     /// Find the start and end location of every word with exactly 13 Unicode
 233     /// word characters:
 234     ///
 235     /// ```rust
 236     /// # extern crate regex; use regex::Regex;
 237     /// # fn main() {
 238     /// let text = "Retroactively relinquishing remunerations is reprehensible.";
 239     /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
 240     ///     println!("{:?}", mat);
 241     /// }
 242     /// # }
 243     /// ```
    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
        // Wrap the searcher's own match iterator in the public `Matches`
        // type.
        Matches(self.0.searcher_str().find_iter(text))
    }
 247
 248     /// Returns the capture groups corresponding to the leftmost-first
 249     /// match in `text`. Capture group `0` always corresponds to the entire
 250     /// match. If no match is found, then `None` is returned.
 251     ///
 252     /// You should only use `captures` if you need access to the location of
 253     /// capturing group matches. Otherwise, `find` is faster for discovering
 254     /// the location of the overall match.
 255     ///
 256     /// # Examples
 257     ///
 258     /// Say you have some text with movie names and their release years,
 259     /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
 260     /// looking like that, while also extracting the movie name and its release
 261     /// year separately.
 262     ///
 263     /// ```rust
 264     /// # extern crate regex; use regex::Regex;
 265     /// # fn main() {
 266     /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
 267     /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
 268     /// let caps = re.captures(text).unwrap();
 269     /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane");
 270     /// assert_eq!(caps.get(2).unwrap().as_str(), "1941");
 271     /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
 272     /// // You can also access the groups by index using the Index notation.
 273     /// // Note that this will panic on an invalid index.
 274     /// assert_eq!(&caps[1], "Citizen Kane");
 275     /// assert_eq!(&caps[2], "1941");
 276     /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
 277     /// # }
 278     /// ```
 279     ///
 280     /// Note that the full match is at capture group `0`. Each subsequent
 281     /// capture group is indexed by the order of its opening `(`.
 282     ///
 283     /// We can make this example a bit clearer by using *named* capture groups:
 284     ///
 285     /// ```rust
 286     /// # extern crate regex; use regex::Regex;
 287     /// # fn main() {
 288     /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
 289     ///                .unwrap();
 290     /// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
 291     /// let caps = re.captures(text).unwrap();
 292     /// assert_eq!(&caps["title"], "Citizen Kane");
 293     /// assert_eq!(&caps["year"], "1941");
 294     /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)");
 295     /// // You can also access the groups by name using the Index notation.
 296     /// // Note that this will panic on an invalid group name.
 297     /// assert_eq!(&caps["title"], "Citizen Kane");
 298     /// assert_eq!(&caps["year"], "1941");
 299     /// assert_eq!(&caps[0], "'Citizen Kane' (1941)");
 300     ///
 301     /// # }
 302     /// ```
 303     ///
 304     /// Here we name the capture groups, which we can access with the `name`
 305     /// method or the `Index` notation with a `&str`. Note that the named
 306     /// capture groups are still accessible with `get` or the `Index` notation
 307     /// with a `usize`.
 308     ///
 309     /// The `0`th capture group is always unnamed, so it must always be
 310     /// accessed with `get(0)` or `[0]`.
 311     pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
 312         let mut locs = self.locations();
 313         self.read_captures_at(&mut locs, text, 0).map(|_| Captures {
 314             text: text,
 315             locs: locs,
 316             named_groups: self.0.capture_name_idx().clone(),
 317         })
 318     }
 319
 320     /// Returns an iterator over all the non-overlapping capture groups matched
 321     /// in `text`. This is operationally the same as `find_iter`, except it
 322     /// yields information about capturing group matches.
 323     ///
 324     /// # Example
 325     ///
 326     /// We can use this to find all movie titles and their release years in
 327     /// some text, where the movie is formatted like "'Title' (xxxx)":
 328     ///
 329     /// ```rust
 330     /// # extern crate regex; use regex::Regex;
 331     /// # fn main() {
 332     /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
 333     ///                .unwrap();
 334     /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
 335     /// for caps in re.captures_iter(text) {
 336     ///     println!("Movie: {:?}, Released: {:?}",
 337     ///              &caps["title"], &caps["year"]);
 338     /// }
 339     /// // Output:
 340     /// // Movie: Citizen Kane, Released: 1941
 341     /// // Movie: The Wizard of Oz, Released: 1939
 342     /// // Movie: M, Released: 1931
 343     /// # }
 344     /// ```
    pub fn captures_iter<'r, 't>(
        &'r self,
        text: &'t str,
    ) -> CaptureMatches<'r, 't> {
        // Wrap the searcher's own captures iterator in the public
        // `CaptureMatches` type.
        CaptureMatches(self.0.searcher_str().captures_iter(text))
    }
 351
 352     /// Returns an iterator of substrings of `text` delimited by a match of the
 353     /// regular expression. Namely, each element of the iterator corresponds to
 354     /// text that *isn't* matched by the regular expression.
 355     ///
 356     /// This method will *not* copy the text given.
 357     ///
 358     /// # Example
 359     ///
 360     /// To split a string delimited by arbitrary amounts of spaces or tabs:
 361     ///
 362     /// ```rust
 363     /// # extern crate regex; use regex::Regex;
 364     /// # fn main() {
 365     /// let re = Regex::new(r"[ \t]+").unwrap();
 366     /// let fields: Vec<&str> = re.split("a b \t  c\td    e").collect();
 367     /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
 368     /// # }
 369     /// ```
    pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> {
        Split {
            // Every match produced by this iterator acts as a delimiter.
            finder: self.find_iter(text),
            // Byte offset at which the next yielded substring begins.
            last: 0,
        }
    }
 376
 377     /// Returns an iterator of at most `limit` substrings of `text` delimited
 378     /// by a match of the regular expression. (A `limit` of `0` will return no
 379     /// substrings.) Namely, each element of the iterator corresponds to text
 380     /// that *isn't* matched by the regular expression. The remainder of the
 381     /// string that is not split will be the last element in the iterator.
 382     ///
 383     /// This method will *not* copy the text given.
 384     ///
 385     /// # Example
 386     ///
 387     /// Get the first two words in some text:
 388     ///
 389     /// ```rust
 390     /// # extern crate regex; use regex::Regex;
 391     /// # fn main() {
 392     /// let re = Regex::new(r"\W+").unwrap();
 393     /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
 394     /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
 395     /// # }
 396     /// ```
    pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize)
                         -> SplitN<'r, 't> {
        SplitN {
            // The unlimited splitter that supplies the substrings.
            splits: self.split(text),
            // How many substrings the iterator may still yield.
            n: limit,
        }
    }
 404
 405     /// Replaces the leftmost-first match with the replacement provided.
 406     /// The replacement can be a regular string (where `$N` and `$name` are
 407     /// expanded to match capture groups) or a function that takes the matches'
 408     /// `Captures` and returns the replaced string.
 409     ///
 410     /// If no match is found, then a copy of the string is returned unchanged.
 411     ///
 412     /// # Replacement string syntax
 413     ///
    /// All instances of `$name` in the replacement text are replaced with the
    /// corresponding capture group `name`.
 416     ///
 417     /// `name` may be an integer corresponding to the index of the
 418     /// capture group (counted by order of opening parenthesis where `0` is the
 419     /// entire match) or it can be a name (consisting of letters, digits or
 420     /// underscores) corresponding to a named capture group.
 421     ///
 422     /// If `name` isn't a valid capture group (whether the name doesn't exist
 423     /// or isn't a valid index), then it is replaced with the empty string.
 424     ///
 425     /// The longest possible name is used. e.g., `$1a` looks up the capture
 426     /// group named `1a` and not the capture group at index `1`. To exert more
 427     /// precise control over the name, use braces, e.g., `${1}a`.
 428     ///
 429     /// To write a literal `$` use `$$`.
 430     ///
 431     /// # Examples
 432     ///
 433     /// Note that this function is polymorphic with respect to the replacement.
 434     /// In typical usage, this can just be a normal string:
 435     ///
 436     /// ```rust
 437     /// # extern crate regex; use regex::Regex;
 438     /// # fn main() {
 439     /// let re = Regex::new("[^01]+").unwrap();
 440     /// assert_eq!(re.replace("1078910", ""), "1010");
 441     /// # }
 442     /// ```
 443     ///
 444     /// But anything satisfying the `Replacer` trait will work. For example,
 445     /// a closure of type `|&Captures| -> String` provides direct access to the
 446     /// captures corresponding to a match. This allows one to access
 447     /// capturing group matches easily:
 448     ///
 449     /// ```rust
 450     /// # extern crate regex; use regex::Regex;
 451     /// # use regex::Captures; fn main() {
 452     /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
 453     /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
 454     ///     format!("{} {}", &caps[2], &caps[1])
 455     /// });
 456     /// assert_eq!(result, "Bruce Springsteen");
 457     /// # }
 458     /// ```
 459     ///
 460     /// But this is a bit cumbersome to use all the time. Instead, a simple
 461     /// syntax is supported that expands `$name` into the corresponding capture
 462     /// group. Here's the last example, but using this expansion technique
 463     /// with named capture groups:
 464     ///
 465     /// ```rust
 466     /// # extern crate regex; use regex::Regex;
 467     /// # fn main() {
 468     /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
 469     /// let result = re.replace("Springsteen, Bruce", "$first $last");
 470     /// assert_eq!(result, "Bruce Springsteen");
 471     /// # }
 472     /// ```
 473     ///
 474     /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
 475     /// would produce the same result. To write a literal `$` use `$$`.
 476     ///
 477     /// Sometimes the replacement string requires use of curly braces to
 478     /// delineate a capture group replacement and surrounding literal text.
 479     /// For example, if we wanted to join two words together with an
 480     /// underscore:
 481     ///
 482     /// ```rust
 483     /// # extern crate regex; use regex::Regex;
 484     /// # fn main() {
 485     /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
 486     /// let result = re.replace("deep fried", "${first}_$second");
 487     /// assert_eq!(result, "deep_fried");
 488     /// # }
 489     /// ```
 490     ///
 491     /// Without the curly braces, the capture group name `first_` would be
 492     /// used, and since it doesn't exist, it would be replaced with the empty
 493     /// string.
 494     ///
    /// Finally, sometimes you just want to replace a literal string with no
    /// regard for capturing group expansion. This can be done by wrapping a
    /// string with `NoExpand`:
 498     ///
 499     /// ```rust
 500     /// # extern crate regex; use regex::Regex;
 501     /// # fn main() {
 502     /// use regex::NoExpand;
 503     ///
 504     /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
 505     /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
 506     /// assert_eq!(result, "$2 $last");
 507     /// # }
 508     /// ```
    pub fn replace<'t, R: Replacer>(
        &self,
        text: &'t str,
        rep: R,
    ) -> Cow<'t, str> {
        // Same as `replacen` with a limit of exactly one replacement.
        self.replacen(text, 1, rep)
    }
 516
 517     /// Replaces all non-overlapping matches in `text` with the replacement
 518     /// provided. This is the same as calling `replacen` with `limit` set to
 519     /// `0`.
 520     ///
 521     /// See the documentation for `replace` for details on how to access
 522     /// capturing group matches in the replacement string.
    pub fn replace_all<'t, R: Replacer>(
        &self,
        text: &'t str,
        rep: R,
    ) -> Cow<'t, str> {
        // A limit of 0 tells `replacen` to replace every match.
        self.replacen(text, 0, rep)
    }
 530
 531     /// Replaces at most `limit` non-overlapping matches in `text` with the
 532     /// replacement provided. If `limit` is 0, then all non-overlapping matches
 533     /// are replaced.
 534     ///
 535     /// See the documentation for `replace` for details on how to access
 536     /// capturing group matches in the replacement string.
 537     pub fn replacen<'t, R: Replacer>(
 538         &self,
 539         text: &'t str,
 540         limit: usize,
 541         mut rep: R,
 542     ) -> Cow<'t, str> {
 543         // If we know that the replacement doesn't have any capture expansions,
 544         // then we can fast path. The fast path can make a tremendous
 545         // difference:
 546         //
 547         //   1) We use `find_iter` instead of `captures_iter`. Not asking for
 548         //      captures generally makes the regex engines faster.
 549         //   2) We don't need to look up all of the capture groups and do
 550         //      replacements inside the replacement string. We just push it
 551         //      at each match and be done with it.
 552         if let Some(rep) = rep.no_expansion() {
 553             let mut it = self.find_iter(text).enumerate().peekable();
 554             if it.peek().is_none() {
 555                 return Cow::Borrowed(text);
 556             }
 557             let mut new = String::with_capacity(text.len());
 558             let mut last_match = 0;
 559             for (i, m) in it {
 560                 if limit > 0 && i >= limit {
 561                     break
 562                 }
 563                 new.push_str(&text[last_match..m.start()]);
 564                 new.push_str(&rep);
 565                 last_match = m.end();
 566             }
 567             new.push_str(&text[last_match..]);
 568             return Cow::Owned(new);
 569         }
 570
 571         // The slower path, which we use if the replacement needs access to
 572         // capture groups.
 573         let mut it = self.captures_iter(text).enumerate().peekable();
 574         if it.peek().is_none() {
 575             return Cow::Borrowed(text);
 576         }
 577         let mut new = String::with_capacity(text.len());
 578         let mut last_match = 0;
 579         for (i, cap) in it {
 580             if limit > 0 && i >= limit {
 581                 break
 582             }
 583             // unwrap on 0 is OK because captures only reports matches
 584             let m = cap.get(0).unwrap();
 585             new.push_str(&text[last_match..m.start()]);
 586             rep.replace_append(&cap, &mut new);
 587             last_match = m.end();
 588         }
 589         new.push_str(&text[last_match..]);
 590         Cow::Owned(new)
 591     }
 592 }
 593
 594 /// Advanced or "lower level" search methods.
 595 impl Regex {
 596     /// Returns the end location of a match in the text given.
 597     ///
 598     /// This method may have the same performance characteristics as
 599     /// `is_match`, except it provides an end location for a match. In
 600     /// particular, the location returned *may be shorter* than the proper end
 601     /// of the leftmost-first match.
 602     ///
 603     /// # Example
 604     ///
 605     /// Typically, `a+` would match the entire first sequence of `a` in some
 606     /// text, but `shortest_match` can give up as soon as it sees the first
 607     /// `a`.
 608     ///
 609     /// ```rust
 610     /// # extern crate regex; use regex::Regex;
 611     /// # fn main() {
 612     /// let text = "aaaaa";
 613     /// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
 614     /// assert_eq!(pos, Some(1));
 615     /// # }
 616     /// ```
    pub fn shortest_match(&self, text: &str) -> Option<usize> {
        // Search the whole haystack, i.e. start at byte offset 0.
        self.shortest_match_at(text, 0)
    }
 620
 621     /// Returns the same as shortest_match, but starts the search at the given
 622     /// offset.
 623     ///
 624     /// The significance of the starting point is that it takes the surrounding
 625     /// context into consideration. For example, the `\A` anchor can only
 626     /// match when `start == 0`.
 627     #[doc(hidden)]
    pub fn shortest_match_at(
        &self,
        text: &str,
        start: usize,
    ) -> Option<usize> {
        // Delegate to the searcher obtained from the wrapped `Exec`.
        self.0.searcher_str().shortest_match_at(text, start)
    }
 635
 636     /// Returns the same as is_match, but starts the search at the given
 637     /// offset.
 638     ///
 639     /// The significance of the starting point is that it takes the surrounding
 640     /// context into consideration. For example, the `\A` anchor can only
 641     /// match when `start == 0`.
 642     #[doc(hidden)]
    pub fn is_match_at(&self, text: &str, start: usize) -> bool {
        // A match exists exactly when the shortest-match search reports an
        // end offset.
        self.shortest_match_at(text, start).is_some()
    }
 646
 647     /// Returns the same as find, but starts the search at the given
 648     /// offset.
 649     ///
 650     /// The significance of the starting point is that it takes the surrounding
 651     /// context into consideration. For example, the `\A` anchor can only
 652     /// match when `start == 0`.
 653     #[doc(hidden)]
 654     pub fn find_at<'t>(
 655         &self,
 656         text: &'t str,
 657         start: usize,
 658     ) -> Option<Match<'t>> {
 659         self.0.searcher_str().find_at(text, start).map(|(s, e)| {
 660             Match::new(text, s, e)
 661         })
 662     }
 663
 664     /// Returns the same as captures, but starts the search at the given
 665     /// offset and populates the capture locations given.
 666     ///
 667     /// The significance of the starting point is that it takes the surrounding
 668     /// context into consideration. For example, the `\A` anchor can only
 669     /// match when `start == 0`.
 670     #[doc(hidden)]
    pub fn read_captures_at<'t>(
        &self,
        locs: &mut Locations,
        text: &'t str,
        start: usize,
    ) -> Option<Match<'t>> {
        self.0
            .searcher_str()
            // Hands `locs` to the searcher and gets back the overall match
            // as a `(start, end)` offset pair, if there is one.
            .read_captures_at(locs, text, start)
            // Turn the offset pair into a `Match` over `text`.
            .map(|(s, e)| Match::new(text, s, e))
    }
 682 }
 683
 684 /// Auxiliary methods.
 685 impl Regex {
 686     /// Returns the original string of this regex.
    pub fn as_str(&self) -> &str {
        // `regex_strings` is indexable; this type always reads its first
        // entry. NOTE(review): presumably `Exec` can hold several patterns
        // (e.g. for sets) and a `Regex` has exactly one; confirm in `exec`.
        &self.0.regex_strings()[0]
    }
 690
 691     /// Returns an iterator over the capture names.
    pub fn capture_names(&self) -> CaptureNames {
        // Borrowing iterator over the capture-name list held by the `Exec`.
        CaptureNames(self.0.capture_names().iter())
    }
 695
 696     /// Returns the number of captures.
    pub fn captures_len(&self) -> usize {
        // Length of the same list that `capture_names` iterates, so unnamed
        // entries are counted too.
        self.0.capture_names().len()
    }
 700
    /// Returns an empty set of locations that can be reused in multiple calls
    /// to `read_captures_at`.
    #[doc(hidden)]
    pub fn locations(&self) -> Locations {
        self.0.searcher_str().locations()
    }
 707 }
 708
 709 /// An iterator over the names of all possible captures.
 710 ///
 711 /// `None` indicates an unnamed capture; the first element (capture 0, the
 712 /// whole matched region) is always unnamed.
 713 ///
 714 /// `'r` is the lifetime of the compiled regular expression.
 715 pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
 716
 717 impl<'r> Iterator for CaptureNames<'r> {
 718     type Item = Option<&'r str>;
 719
 720     fn next(&mut self) -> Option<Option<&'r str>> {
 721         self.0
 722             .next()
 723             .as_ref()
 724             .map(|slot| slot.as_ref().map(|name| name.as_ref()))
 725     }
 726
 727     fn size_hint(&self) -> (usize, Option<usize>) {
 728         self.0.size_hint()
 729     }
 730 }
 731
 732 /// Yields all substrings delimited by a regular expression match.
 733 ///
 734 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 735 /// lifetime of the string being split.
 736 pub struct Split<'r, 't> {
 737     finder: Matches<'r, 't>,
 738     last: usize,
 739 }
 740
 741 impl<'r, 't> Iterator for Split<'r, 't> {
 742     type Item = &'t str;
 743
 744     fn next(&mut self) -> Option<&'t str> {
 745         let text = self.finder.0.text();
 746         match self.finder.next() {
 747             None => {
 748                 if self.last >= text.len() {
 749                     None
 750                 } else {
 751                     let s = &text[self.last..];
 752                     self.last = text.len();
 753                     Some(s)
 754                 }
 755             }
 756             Some(m) => {
 757                 let matched = &text[self.last..m.start()];
 758                 self.last = m.end();
 759                 Some(matched)
 760             }
 761         }
 762     }
 763 }
 764
 765 /// Yields at most `N` substrings delimited by a regular expression match.
 766 ///
 767 /// The last substring will be whatever remains after splitting.
 768 ///
 769 /// `'r` is the lifetime of the compiled regular expression and `'t` is the
 770 /// lifetime of the string being split.
 771 pub struct SplitN<'r, 't> {
 772     splits: Split<'r, 't>,
 773     n: usize,
 774 }
 775
 776 impl<'r, 't> Iterator for SplitN<'r, 't> {
 777     type Item = &'t str;
 778
 779     fn next(&mut self) -> Option<&'t str> {
 780         if self.n == 0 {
 781             return None
 782         }
 783         self.n -= 1;
 784         if self.n == 0 {
 785             let text = self.splits.finder.0.text();
 786             Some(&text[self.splits.last..])
 787         } else {
 788             self.splits.next()
 789         }
 790     }
 791 }
 792
/// Captures represents a group of captured strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
/// index corresponds to the next capture group in the regex. If a capture
/// group is named, then the matched string is *also* available via the `name`
/// method. (Note that the 0th capture is always unnamed and so must be
/// accessed with the `get` method.)
///
/// Positions returned from a capture group are always byte indices.
///
/// `'t` is the lifetime of the matched text.
pub struct Captures<'t> {
    // The haystack that was searched; capture offsets index into it.
    text: &'t str,
    // Start/end byte offsets of the capture groups for this match.
    locs: Locations,
    // Maps a capture group name to its index, as used by `name`.
    named_groups: Arc<HashMap<String, usize>>,
}
 809
 810 impl<'t> Captures<'t> {
 811     /// Returns the match associated with the capture group at index `i`. If
 812     /// `i` does not correspond to a capture group, or if the capture group
 813     /// did not participate in the match, then `None` is returned.
 814     ///
 815     /// # Examples
 816     ///
 817     /// Get the text of the match with a default of an empty string if this
 818     /// group didn't participate in the match:
 819     ///
 820     /// ```rust
 821     /// # use regex::Regex;
 822     /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap();
 823     /// let caps = re.captures("abc123").unwrap();
 824     ///
 825     /// let text1 = caps.get(1).map_or("", |m| m.as_str());
 826     /// let text2 = caps.get(2).map_or("", |m| m.as_str());
 827     /// assert_eq!(text1, "123");
 828     /// assert_eq!(text2, "");
 829     /// ```
 830     pub fn get(&self, i: usize) -> Option<Match<'t>> {
 831         self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e))
 832     }
 833
 834     /// Returns the match for the capture group named `name`. If `name` isn't a
 835     /// valid capture group or didn't match anything, then `None` is returned.
 836     pub fn name(&self, name: &str) -> Option<Match<'t>> {
 837         self.named_groups.get(name).and_then(|&i| self.get(i))
 838     }
 839
 840     /// An iterator that yields all capturing matches in the order in which
 841     /// they appear in the regex. If a particular capture group didn't
 842     /// participate in the match, then `None` is yielded for that capture.
 843     ///
 844     /// The first match always corresponds to the overall match of the regex.
 845     pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
 846         SubCaptureMatches {
 847             caps: self,
 848             it: self.locs.iter(),
 849         }
 850     }
 851
 852     /// Expands all instances of `$name` in `replacement` to the corresponding
 853     /// capture group `name`, and writes them to the `dst` buffer given.
 854     ///
 855     /// `name` may be an integer corresponding to the index of the
 856     /// capture group (counted by order of opening parenthesis where `0` is the
 857     /// entire match) or it can be a name (consisting of letters, digits or
 858     /// underscores) corresponding to a named capture group.
 859     ///
 860     /// If `name` isn't a valid capture group (whether the name doesn't exist
 861     /// or isn't a valid index), then it is replaced with the empty string.
 862     ///
 863     /// The longest possible name is used. e.g., `$1a` looks up the capture
 864     /// group named `1a` and not the capture group at index `1`. To exert more
 865     /// precise control over the name, use braces, e.g., `${1}a`.
 866     ///
 867     /// To write a literal `$` use `$$`.
 868     pub fn expand(&self, replacement: &str, dst: &mut String) {
 869         expand_str(self, replacement, dst)
 870     }
 871
 872     /// Returns the number of captured groups.
 873     ///
 874     /// This is always at least `1`, since every regex has at least one capture
 875     /// group that corresponds to the full match.
 876     #[inline]
 877     pub fn len(&self) -> usize {
 878         self.locs.len()
 879     }
 880 }
 881
 882 impl<'t> fmt::Debug for Captures<'t> {
 883     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 884         f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
 885     }
 886 }
 887
// Private wrapper around a borrowed `Captures`; it exists only to carry a
// separate `fmt::Debug` implementation for the captures' contents.
struct CapturesDebug<'c, 't: 'c>(&'c Captures<'t>);
 889
 890 impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
 891     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 892         // We'd like to show something nice here, even if it means an
 893         // allocation to build a reverse index.
 894         let slot_to_name: HashMap<&usize, &String> =
 895             self.0.named_groups.iter().map(|(a, b)| (b, a)).collect();
 896         let mut map = f.debug_map();
 897         for (slot, m) in self.0.locs.iter().enumerate() {
 898             let m = m.map(|(s, e)| &self.0.text[s..e]);
 899             if let Some(name) = slot_to_name.get(&slot) {
 900                 map.entry(&name, &m);
 901             } else {
 902                 map.entry(&slot, &m);
 903             }
 904         }
 905         map.finish()
 906     }
 907 }
 908
 909 /// Get a group by index.
 910 ///
 911 /// `'t` is the lifetime of the matched text.
 912 ///
 913 /// The text can't outlive the `Captures` object if this method is
 914 /// used, because of how `Index` is defined (normally `a[i]` is part
 915 /// of `a` and can't outlive it); to do that, use `get()` instead.
 916 ///
 917 /// # Panics
 918 ///
 919 /// If there is no group at the given index.
 920 impl<'t> Index<usize> for Captures<'t> {
 921     type Output = str;
 922
 923     fn index(&self, i: usize) -> &str {
 924         self.get(i).map(|m| m.as_str())
 925             .unwrap_or_else(|| panic!("no group at index '{}'", i))
 926     }
 927 }
 928
 929 /// Get a group by name.
 930 ///
 931 /// `'t` is the lifetime of the matched text and `'i` is the lifetime
 932 /// of the group name (the index).
 933 ///
 934 /// The text can't outlive the `Captures` object if this method is
 935 /// used, because of how `Index` is defined (normally `a[i]` is part
 936 /// of `a` and can't outlive it); to do that, use `name` instead.
 937 ///
 938 /// # Panics
 939 ///
 940 /// If there is no group named by the given value.
 941 impl<'t, 'i> Index<&'i str> for Captures<'t> {
 942     type Output = str;
 943
 944     fn index<'a>(&'a self, name: &'i str) -> &'a str {
 945         self.name(name).map(|m| m.as_str())
 946             .unwrap_or_else(|| panic!("no group named '{}'", name))
 947     }
 948 }
 949
/// An iterator that yields all capturing matches in the order in which they
/// appear in the regex.
///
/// If a particular capture group didn't participate in the match, then `None`
/// is yielded for that capture. The first match always corresponds to the
/// overall match of the regex.
///
/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
/// the lifetime `'t` corresponds to the originally matched text.
pub struct SubCaptureMatches<'c, 't: 'c> {
    // The captures being iterated; supplies the text that matches refer to.
    caps: &'c Captures<'t>,
    // Iterator over the capture positions held by `caps`.
    it: SubCapturesPosIter<'c>,
}
 963
 964 impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
 965     type Item = Option<Match<'t>>;
 966
 967     fn next(&mut self) -> Option<Option<Match<'t>>> {
 968         self.it.next()
 969             .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e)))
 970     }
 971 }
 972
/// An iterator that yields all non-overlapping capture groups matching a
/// particular regular expression.
///
/// The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
/// lifetime of the matched string.
// Newtype over the generic `re_trait` capture iterator, specialized to the
// `&str`-based executor.
pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>);
 981
 982 impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
 983     type Item = Captures<'t>;
 984
 985     fn next(&mut self) -> Option<Captures<'t>> {
 986         self.0.next().map(|locs| Captures {
 987             text: self.0.text(),
 988             locs: locs,
 989             named_groups: self.0.regex().capture_name_idx().clone(),
 990         })
 991     }
 992 }
 993
/// An iterator over all non-overlapping matches for a particular string.
///
/// The iterator yields a `Match` value. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
/// lifetime of the matched string.
// Newtype over the generic `re_trait` match iterator, specialized to the
// `&str`-based executor.
pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>);
1002
1003 impl<'r, 't> Iterator for Matches<'r, 't> {
1004     type Item = Match<'t>;
1005
1006     fn next(&mut self) -> Option<Match<'t>> {
1007         let text = self.0.text();
1008         self.0.next().map(|(s, e)| Match::new(text, s, e))
1009     }
1010 }
1011
/// Replacer describes types that can be used to replace matches in a string.
///
/// In general, users of this crate shouldn't need to implement this trait,
/// since implementations are already provided for `&str` and
/// `FnMut(&Captures) -> String`, which covers most use cases.
pub trait Replacer {
    /// Appends text to `dst` to replace the current match.
    ///
    /// The current match is represented by `caps`, which is guaranteed to
    /// have a match at capture group `0`.
    ///
    /// For example, a no-op replacement would be
    /// `dst.push_str(caps.get(0).unwrap().as_str())`.
    fn replace_append(&mut self, caps: &Captures, dst: &mut String);

    /// Return a fixed unchanging replacement string.
    ///
    /// When doing replacements, if access to `Captures` is not needed (e.g.,
    /// the replacement string does not need `$` expansion), then it can
    /// be beneficial to avoid finding sub-captures.
    ///
    /// The default implementation returns `None`, meaning that
    /// `replace_append` must be used for every match.
    ///
    /// In general, this is called once for every call to `replacen`.
    fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> {
        None
    }
}
1038
1039 impl<'a> Replacer for &'a str {
1040     fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
1041         caps.expand(*self, dst);
1042     }
1043
1044     fn no_expansion(&mut self) -> Option<Cow<str>> {
1045         match memchr(b'$', self.as_bytes()) {
1046             Some(_) => None,
1047             None => Some(Cow::Borrowed(*self)),
1048         }
1049     }
1050 }
1051
1052 impl<F> Replacer for F where F: FnMut(&Captures) -> String {
1053     fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
1054         dst.push_str(&(*self)(caps));
1055     }
1056 }
1057
/// `NoExpand` indicates literal string replacement.
///
/// It can be used with `replace` and `replace_all` to do a literal string
/// replacement without expanding `$name` to their corresponding capture
/// groups. This can be both convenient (to avoid escaping `$`, for example)
/// and performant (since capture groups don't need to be found).
///
/// `'t` is the lifetime of the literal text.
// `Clone` and `Debug` are derived so that this public wrapper can be
// duplicated and shown in diagnostics like the other public types here.
#[derive(Clone, Debug)]
pub struct NoExpand<'t>(pub &'t str);
1067
1068 impl<'t> Replacer for NoExpand<'t> {
1069     fn replace_append(&mut self, _: &Captures, dst: &mut String) {
1070         dst.push_str(self.0);
1071     }
1072
1073     fn no_expansion(&mut self) -> Option<Cow<str>> {
1074         Some(Cow::Borrowed(self.0))
1075     }
1076 }