vendor/regex/src/re_builder.rs

   1 /// The set of user configurable options for compiling zero or more regexes.
   2 #[derive(Clone, Debug)]
   3 #[allow(missing_docs)]
   4 pub struct RegexOptions {
   5     pub pats: Vec<String>,
   6     pub size_limit: usize,
   7     pub dfa_size_limit: usize,
   8     pub nest_limit: u32,
   9     pub case_insensitive: bool,
  10     pub multi_line: bool,
  11     pub dot_matches_new_line: bool,
  12     pub swap_greed: bool,
  13     pub ignore_whitespace: bool,
  14     pub unicode: bool,
  15     pub octal: bool,
  16 }
  17
  18 impl Default for RegexOptions {
  19     fn default() -> Self {
  20         RegexOptions {
  21             pats: vec![],
  22             size_limit: 10 * (1 << 20),
  23             dfa_size_limit: 2 * (1 << 20),
  24             nest_limit: 250,
  25             case_insensitive: false,
  26             multi_line: false,
  27             dot_matches_new_line: false,
  28             swap_greed: false,
  29             ignore_whitespace: false,
  30             unicode: true,
  31             octal: false,
  32         }
  33     }
  34 }
  35
  36 macro_rules! define_builder {
  37     ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
  38         pub mod $name {
  39             use super::RegexOptions;
  40             use crate::error::Error;
  41             use crate::exec::ExecBuilder;
  42
  43             use crate::$regex_mod::Regex;
  44
  45             /// A configurable builder for a regular expression.
  46             ///
  47             /// A builder can be used to configure how the regex is built, for example, by
  48             /// setting the default flags (which can be overridden in the expression
  49             /// itself) or setting various limits.
  50             #[derive(Debug)]
  51             pub struct RegexBuilder(RegexOptions);
  52
  53             impl RegexBuilder {
  54                 /// Create a new regular expression builder with the given pattern.
  55                 ///
  56                 /// If the pattern is invalid, then an error will be returned when
  57                 /// `build` is called.
  58                 pub fn new(pattern: &str) -> RegexBuilder {
  59                     let mut builder = RegexBuilder(RegexOptions::default());
  60                     builder.0.pats.push(pattern.to_owned());
  61                     builder
  62                 }
  63
  64                 /// Consume the builder and compile the regular expression.
  65                 ///
  66                 /// Note that calling `as_str` on the resulting `Regex` will produce the
  67                 /// pattern given to `new` verbatim. Notably, it will not incorporate any
  68                 /// of the flags set on this builder.
  69                 pub fn build(&self) -> Result<Regex, Error> {
  70                     ExecBuilder::new_options(self.0.clone())
  71                         .only_utf8($only_utf8)
  72                         .build()
  73                         .map(Regex::from)
  74                 }
  75
  76                 /// Set the value for the case insensitive (`i`) flag.
  77                 ///
  78                 /// When enabled, letters in the pattern will match both upper case and
  79                 /// lower case variants.
  80                 pub fn case_insensitive(
  81                     &mut self,
  82                     yes: bool,
  83                 ) -> &mut RegexBuilder {
  84                     self.0.case_insensitive = yes;
  85                     self
  86                 }
  87
  88                 /// Set the value for the multi-line matching (`m`) flag.
  89                 ///
  90                 /// When enabled, `^` matches the beginning of lines and `$` matches the
  91                 /// end of lines.
  92                 ///
  93                 /// By default, they match beginning/end of the input.
  94                 pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
  95                     self.0.multi_line = yes;
  96                     self
  97                 }
  98
  99                 /// Set the value for the any character (`s`) flag, where in `.` matches
 100                 /// anything when `s` is set and matches anything except for new line when
 101                 /// it is not set (the default).
 102                 ///
 103                 /// N.B. "matches anything" means "any byte" when Unicode is disabled and
 104                 /// means "any valid UTF-8 encoding of any Unicode scalar value" when
 105                 /// Unicode is enabled.
 106                 pub fn dot_matches_new_line(
 107                     &mut self,
 108                     yes: bool,
 109                 ) -> &mut RegexBuilder {
 110                     self.0.dot_matches_new_line = yes;
 111                     self
 112                 }
 113
 114                 /// Set the value for the greedy swap (`U`) flag.
 115                 ///
 116                 /// When enabled, a pattern like `a*` is lazy (tries to find shortest
 117                 /// match) and `a*?` is greedy (tries to find longest match).
 118                 ///
 119                 /// By default, `a*` is greedy and `a*?` is lazy.
 120                 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
 121                     self.0.swap_greed = yes;
 122                     self
 123                 }
 124
 125                 /// Set the value for the ignore whitespace (`x`) flag.
 126                 ///
 127                 /// When enabled, whitespace such as new lines and spaces will be ignored
 128                 /// between expressions of the pattern, and `#` can be used to start a
 129                 /// comment until the next new line.
 130                 pub fn ignore_whitespace(
 131                     &mut self,
 132                     yes: bool,
 133                 ) -> &mut RegexBuilder {
 134                     self.0.ignore_whitespace = yes;
 135                     self
 136                 }
 137
 138                 /// Set the value for the Unicode (`u`) flag.
 139                 ///
 140                 /// Enabled by default. When disabled, character classes such as `\w` only
 141                 /// match ASCII word characters instead of all Unicode word characters.
 142                 pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
 143                     self.0.unicode = yes;
 144                     self
 145                 }
 146
 147                 /// Whether to support octal syntax or not.
 148                 ///
 149                 /// Octal syntax is a little-known way of uttering Unicode codepoints in
 150                 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
 151                 /// `\141` are all equivalent regular expressions, where the last example
 152                 /// shows octal syntax.
 153                 ///
 154                 /// While supporting octal syntax isn't in and of itself a problem, it does
 155                 /// make good error messages harder. That is, in PCRE based regex engines,
 156                 /// syntax like `\0` invokes a backreference, which is explicitly
 157                 /// unsupported in Rust's regex engine. However, many users expect it to
 158                 /// be supported. Therefore, when octal support is disabled, the error
 159                 /// message will explicitly mention that backreferences aren't supported.
 160                 ///
 161                 /// Octal syntax is disabled by default.
 162                 pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
 163                     self.0.octal = yes;
 164                     self
 165                 }
 166
 167                 /// Set the approximate size limit of the compiled regular expression.
 168                 ///
 169                 /// This roughly corresponds to the number of bytes occupied by a single
 170                 /// compiled program. If the program exceeds this number, then a
 171                 /// compilation error is returned.
 172                 pub fn size_limit(
 173                     &mut self,
 174                     limit: usize,
 175                 ) -> &mut RegexBuilder {
 176                     self.0.size_limit = limit;
 177                     self
 178                 }
 179
 180                 /// Set the approximate size of the cache used by the DFA.
 181                 ///
 182                 /// This roughly corresponds to the number of bytes that the DFA will
 183                 /// use while searching.
 184                 ///
 185                 /// Note that this is a *per thread* limit. There is no way to set a global
 186                 /// limit. In particular, if a regex is used from multiple threads
 187                 /// simultaneously, then each thread may use up to the number of bytes
 188                 /// specified here.
 189                 pub fn dfa_size_limit(
 190                     &mut self,
 191                     limit: usize,
 192                 ) -> &mut RegexBuilder {
 193                     self.0.dfa_size_limit = limit;
 194                     self
 195                 }
 196
 197                 /// Set the nesting limit for this parser.
 198                 ///
 199                 /// The nesting limit controls how deep the abstract syntax tree is allowed
 200                 /// to be. If the AST exceeds the given limit (e.g., with too many nested
 201                 /// groups), then an error is returned by the parser.
 202                 ///
 203                 /// The purpose of this limit is to act as a heuristic to prevent stack
 204                 /// overflow for consumers that do structural induction on an `Ast` using
 205                 /// explicit recursion. While this crate never does this (instead using
 206                 /// constant stack space and moving the call stack to the heap), other
 207                 /// crates may.
 208                 ///
 209                 /// This limit is not checked until the entire Ast is parsed. Therefore,
 210                 /// if callers want to put a limit on the amount of heap space used, then
 211                 /// they should impose a limit on the length, in bytes, of the concrete
 212                 /// pattern string. In particular, this is viable since this parser
 213                 /// implementation will limit itself to heap space proportional to the
 214                 /// length of the pattern string.
 215                 ///
 216                 /// Note that a nest limit of `0` will return a nest limit error for most
 217                 /// patterns but not all. For example, a nest limit of `0` permits `a` but
 218                 /// not `ab`, since `ab` requires a concatenation, which results in a nest
 219                 /// depth of `1`. In general, a nest limit is not something that manifests
 220                 /// in an obvious way in the concrete syntax, therefore, it should not be
 221                 /// used in a granular way.
 222                 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
 223                     self.0.nest_limit = limit;
 224                     self
 225                 }
 226             }
 227         }
 228     };
 229 }
 230
// Instantiate the builder modules: `bytes` builds `crate::re_bytes::Regex` with
// `only_utf8(false)`, and `unicode` builds `crate::re_unicode::Regex` with
// `only_utf8(true)`.
define_builder!(bytes, re_bytes, false);
define_builder!(unicode, re_unicode, true);
 233
 234 macro_rules! define_set_builder {
 235     ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
 236         pub mod $name {
 237             use super::RegexOptions;
 238             use crate::error::Error;
 239             use crate::exec::ExecBuilder;
 240
 241             use crate::re_set::$regex_mod::RegexSet;
 242
 243             /// A configurable builder for a set of regular expressions.
 244             ///
 245             /// A builder can be used to configure how the regexes are built, for example,
 246             /// by setting the default flags (which can be overridden in the expression
 247             /// itself) or setting various limits.
 248             #[derive(Debug)]
 249             pub struct RegexSetBuilder(RegexOptions);
 250
 251             impl RegexSetBuilder {
 252                 /// Create a new regular expression builder with the given pattern.
 253                 ///
 254                 /// If the pattern is invalid, then an error will be returned when
 255                 /// `build` is called.
 256                 pub fn new<I, S>(patterns: I) -> RegexSetBuilder
 257                 where
 258                     S: AsRef<str>,
 259                     I: IntoIterator<Item = S>,
 260                 {
 261                     let mut builder = RegexSetBuilder(RegexOptions::default());
 262                     for pat in patterns {
 263                         builder.0.pats.push(pat.as_ref().to_owned());
 264                     }
 265                     builder
 266                 }
 267
 268                 /// Consume the builder and compile the regular expressions into a set.
 269                 pub fn build(&self) -> Result<RegexSet, Error> {
 270                     ExecBuilder::new_options(self.0.clone())
 271                         .only_utf8($only_utf8)
 272                         .build()
 273                         .map(RegexSet::from)
 274                 }
 275
 276                 /// Set the value for the case insensitive (`i`) flag.
 277                 pub fn case_insensitive(
 278                     &mut self,
 279                     yes: bool,
 280                 ) -> &mut RegexSetBuilder {
 281                     self.0.case_insensitive = yes;
 282                     self
 283                 }
 284
 285                 /// Set the value for the multi-line matching (`m`) flag.
 286                 pub fn multi_line(
 287                     &mut self,
 288                     yes: bool,
 289                 ) -> &mut RegexSetBuilder {
 290                     self.0.multi_line = yes;
 291                     self
 292                 }
 293
 294                 /// Set the value for the any character (`s`) flag, where in `.` matches
 295                 /// anything when `s` is set and matches anything except for new line when
 296                 /// it is not set (the default).
 297                 ///
 298                 /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
 299                 /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
 300                 /// expressions.
 301                 pub fn dot_matches_new_line(
 302                     &mut self,
 303                     yes: bool,
 304                 ) -> &mut RegexSetBuilder {
 305                     self.0.dot_matches_new_line = yes;
 306                     self
 307                 }
 308
 309                 /// Set the value for the greedy swap (`U`) flag.
 310                 pub fn swap_greed(
 311                     &mut self,
 312                     yes: bool,
 313                 ) -> &mut RegexSetBuilder {
 314                     self.0.swap_greed = yes;
 315                     self
 316                 }
 317
 318                 /// Set the value for the ignore whitespace (`x`) flag.
 319                 pub fn ignore_whitespace(
 320                     &mut self,
 321                     yes: bool,
 322                 ) -> &mut RegexSetBuilder {
 323                     self.0.ignore_whitespace = yes;
 324                     self
 325                 }
 326
 327                 /// Set the value for the Unicode (`u`) flag.
 328                 pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
 329                     self.0.unicode = yes;
 330                     self
 331                 }
 332
 333                 /// Whether to support octal syntax or not.
 334                 ///
 335                 /// Octal syntax is a little-known way of uttering Unicode codepoints in
 336                 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
 337                 /// `\141` are all equivalent regular expressions, where the last example
 338                 /// shows octal syntax.
 339                 ///
 340                 /// While supporting octal syntax isn't in and of itself a problem, it does
 341                 /// make good error messages harder. That is, in PCRE based regex engines,
 342                 /// syntax like `\0` invokes a backreference, which is explicitly
 343                 /// unsupported in Rust's regex engine. However, many users expect it to
 344                 /// be supported. Therefore, when octal support is disabled, the error
 345                 /// message will explicitly mention that backreferences aren't supported.
 346                 ///
 347                 /// Octal syntax is disabled by default.
 348                 pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
 349                     self.0.octal = yes;
 350                     self
 351                 }
 352
 353                 /// Set the approximate size limit of the compiled regular expression.
 354                 ///
 355                 /// This roughly corresponds to the number of bytes occupied by a single
 356                 /// compiled program. If the program exceeds this number, then a
 357                 /// compilation error is returned.
 358                 pub fn size_limit(
 359                     &mut self,
 360                     limit: usize,
 361                 ) -> &mut RegexSetBuilder {
 362                     self.0.size_limit = limit;
 363                     self
 364                 }
 365
 366                 /// Set the approximate size of the cache used by the DFA.
 367                 ///
 368                 /// This roughly corresponds to the number of bytes that the DFA will
 369                 /// use while searching.
 370                 ///
 371                 /// Note that this is a *per thread* limit. There is no way to set a global
 372                 /// limit. In particular, if a regex is used from multiple threads
 373                 /// simultaneously, then each thread may use up to the number of bytes
 374                 /// specified here.
 375                 pub fn dfa_size_limit(
 376                     &mut self,
 377                     limit: usize,
 378                 ) -> &mut RegexSetBuilder {
 379                     self.0.dfa_size_limit = limit;
 380                     self
 381                 }
 382
 383                 /// Set the nesting limit for this parser.
 384                 ///
 385                 /// The nesting limit controls how deep the abstract syntax tree is allowed
 386                 /// to be. If the AST exceeds the given limit (e.g., with too many nested
 387                 /// groups), then an error is returned by the parser.
 388                 ///
 389                 /// The purpose of this limit is to act as a heuristic to prevent stack
 390                 /// overflow for consumers that do structural induction on an `Ast` using
 391                 /// explicit recursion. While this crate never does this (instead using
 392                 /// constant stack space and moving the call stack to the heap), other
 393                 /// crates may.
 394                 ///
 395                 /// This limit is not checked until the entire Ast is parsed. Therefore,
 396                 /// if callers want to put a limit on the amount of heap space used, then
 397                 /// they should impose a limit on the length, in bytes, of the concrete
 398                 /// pattern string. In particular, this is viable since this parser
 399                 /// implementation will limit itself to heap space proportional to the
 400                 /// length of the pattern string.
 401                 ///
 402                 /// Note that a nest limit of `0` will return a nest limit error for most
 403                 /// patterns but not all. For example, a nest limit of `0` permits `a` but
 404                 /// not `ab`, since `ab` requires a concatenation, which results in a nest
 405                 /// depth of `1`. In general, a nest limit is not something that manifests
 406                 /// in an obvious way in the concrete syntax, therefore, it should not be
 407                 /// used in a granular way.
 408                 pub fn nest_limit(
 409                     &mut self,
 410                     limit: u32,
 411                 ) -> &mut RegexSetBuilder {
 412                     self.0.nest_limit = limit;
 413                     self
 414                 }
 415             }
 416         }
 417     };
 418 }
 419
// Instantiate the set-builder modules: `set_bytes` builds
// `crate::re_set::bytes::RegexSet` with `only_utf8(false)`, and `set_unicode`
// builds `crate::re_set::unicode::RegexSet` with `only_utf8(true)`.
define_set_builder!(set_bytes, bytes, false);
define_set_builder!(set_unicode, unicode, true);