vendor/regex-syntax/src/parser.rs

   1 use crate::ast;
   2 use crate::hir;
   3
   4 use crate::Result;
   5
   6 /// A builder for a regular expression parser.
   7 ///
   8 /// This builder permits modifying configuration options for the parser.
   9 ///
  10 /// This type combines the builder options for both the
  11 /// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html)
  12 /// and the
  13 /// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html).
  14 #[derive(Clone, Debug, Default)]
  15 pub struct ParserBuilder {
  16     ast: ast::parse::ParserBuilder,
  17     hir: hir::translate::TranslatorBuilder,
  18 }
  19
  20 impl ParserBuilder {
  21     /// Create a new parser builder with a default configuration.
  22     pub fn new() -> ParserBuilder {
  23         ParserBuilder::default()
  24     }
  25
  26     /// Build a parser from this configuration with the given pattern.
  27     pub fn build(&self) -> Parser {
  28         Parser { ast: self.ast.build(), hir: self.hir.build() }
  29     }
  30
  31     /// Set the nesting limit for this parser.
  32     ///
  33     /// The nesting limit controls how deep the abstract syntax tree is allowed
  34     /// to be. If the AST exceeds the given limit (e.g., with too many nested
  35     /// groups), then an error is returned by the parser.
  36     ///
  37     /// The purpose of this limit is to act as a heuristic to prevent stack
  38     /// overflow for consumers that do structural induction on an `Ast` using
  39     /// explicit recursion. While this crate never does this (instead using
  40     /// constant stack space and moving the call stack to the heap), other
  41     /// crates may.
  42     ///
  43     /// This limit is not checked until the entire Ast is parsed. Therefore,
  44     /// if callers want to put a limit on the amount of heap space used, then
  45     /// they should impose a limit on the length, in bytes, of the concrete
  46     /// pattern string. In particular, this is viable since this parser
  47     /// implementation will limit itself to heap space proportional to the
  48     /// length of the pattern string.
  49     ///
  50     /// Note that a nest limit of `0` will return a nest limit error for most
  51     /// patterns but not all. For example, a nest limit of `0` permits `a` but
  52     /// not `ab`, since `ab` requires a concatenation, which results in a nest
  53     /// depth of `1`. In general, a nest limit is not something that manifests
  54     /// in an obvious way in the concrete syntax, therefore, it should not be
  55     /// used in a granular way.
  56     pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
  57         self.ast.nest_limit(limit);
  58         self
  59     }
  60
  61     /// Whether to support octal syntax or not.
  62     ///
  63     /// Octal syntax is a little-known way of uttering Unicode codepoints in
  64     /// a regular expression. For example, `a`, `\x61`, `\u0061` and
  65     /// `\141` are all equivalent regular expressions, where the last example
  66     /// shows octal syntax.
  67     ///
  68     /// While supporting octal syntax isn't in and of itself a problem, it does
  69     /// make good error messages harder. That is, in PCRE based regex engines,
  70     /// syntax like `\0` invokes a backreference, which is explicitly
  71     /// unsupported in Rust's regex engine. However, many users expect it to
  72     /// be supported. Therefore, when octal support is disabled, the error
  73     /// message will explicitly mention that backreferences aren't supported.
  74     ///
  75     /// Octal syntax is disabled by default.
  76     pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
  77         self.ast.octal(yes);
  78         self
  79     }
  80
  81     /// When enabled, the parser will permit the construction of a regular
  82     /// expression that may match invalid UTF-8.
  83     ///
  84     /// When disabled (the default), the parser is guaranteed to produce
  85     /// an expression that will only ever match valid UTF-8 (otherwise, the
  86     /// parser will return an error).
  87     ///
  88     /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
  89     /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
  90     /// the parser to return an error. Namely, a negated ASCII word boundary
  91     /// can result in matching positions that aren't valid UTF-8 boundaries.
  92     pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
  93         self.hir.allow_invalid_utf8(yes);
  94         self
  95     }
  96
  97     /// Enable verbose mode in the regular expression.
  98     ///
  99     /// When enabled, verbose mode permits insignificant whitespace in many
 100     /// places in the regular expression, as well as comments. Comments are
 101     /// started using `#` and continue until the end of the line.
 102     ///
 103     /// By default, this is disabled. It may be selectively enabled in the
 104     /// regular expression by using the `x` flag regardless of this setting.
 105     pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
 106         self.ast.ignore_whitespace(yes);
 107         self
 108     }
 109
 110     /// Enable or disable the case insensitive flag by default.
 111     ///
 112     /// By default this is disabled. It may alternatively be selectively
 113     /// enabled in the regular expression itself via the `i` flag.
 114     pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
 115         self.hir.case_insensitive(yes);
 116         self
 117     }
 118
 119     /// Enable or disable the multi-line matching flag by default.
 120     ///
 121     /// By default this is disabled. It may alternatively be selectively
 122     /// enabled in the regular expression itself via the `m` flag.
 123     pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
 124         self.hir.multi_line(yes);
 125         self
 126     }
 127
 128     /// Enable or disable the "dot matches any character" flag by default.
 129     ///
 130     /// By default this is disabled. It may alternatively be selectively
 131     /// enabled in the regular expression itself via the `s` flag.
 132     pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
 133         self.hir.dot_matches_new_line(yes);
 134         self
 135     }
 136
 137     /// Enable or disable the "swap greed" flag by default.
 138     ///
 139     /// By default this is disabled. It may alternatively be selectively
 140     /// enabled in the regular expression itself via the `U` flag.
 141     pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
 142         self.hir.swap_greed(yes);
 143         self
 144     }
 145
 146     /// Enable or disable the Unicode flag (`u`) by default.
 147     ///
 148     /// By default this is **enabled**. It may alternatively be selectively
 149     /// disabled in the regular expression itself via the `u` flag.
 150     ///
 151     /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
 152     /// default), a regular expression will fail to parse if Unicode mode is
 153     /// disabled and a sub-expression could possibly match invalid UTF-8.
 154     pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
 155         self.hir.unicode(yes);
 156         self
 157     }
 158 }
 159
 160 /// A convenience parser for regular expressions.
 161 ///
 162 /// This parser takes as input a regular expression pattern string (the
 163 /// "concrete syntax") and returns a high-level intermediate representation
 164 /// (the HIR) suitable for most types of analysis. In particular, this parser
 165 /// hides the intermediate state of producing an AST (the "abstract syntax").
 166 /// The AST is itself far more complex than the HIR, so this parser serves as a
 167 /// convenience for never having to deal with it at all.
 168 ///
 169 /// If callers have more fine grained use cases that need an AST, then please
 170 /// see the [`ast::parse`](ast/parse/index.html) module.
 171 ///
 172 /// A `Parser` can be configured in more detail via a
 173 /// [`ParserBuilder`](struct.ParserBuilder.html).
 174 #[derive(Clone, Debug)]
 175 pub struct Parser {
 176     ast: ast::parse::Parser,
 177     hir: hir::translate::Translator,
 178 }
 179
 180 impl Parser {
 181     /// Create a new parser with a default configuration.
 182     ///
 183     /// The parser can be run with `parse` method. The parse method returns
 184     /// a high level intermediate representation of the given regular
 185     /// expression.
 186     ///
 187     /// To set configuration options on the parser, use
 188     /// [`ParserBuilder`](struct.ParserBuilder.html).
 189     pub fn new() -> Parser {
 190         ParserBuilder::new().build()
 191     }
 192
 193     /// Parse the regular expression into a high level intermediate
 194     /// representation.
 195     pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> {
 196         let ast = self.ast.parse(pattern)?;
 197         let hir = self.hir.translate(pattern, &ast)?;
 198         Ok(hir)
 199     }
 200 }