]>
Commit | Line | Data |
---|---|---|
0531ce1d XL |
1 | use ast; |
2 | use hir; | |
3 | ||
4 | use Result; | |
5 | ||
6 | /// A builder for a regular expression parser. | |
7 | /// | |
8 | /// This builder permits modifying configuration options for the parser. | |
9 | /// | |
10 | /// This type combines the builder options for both the | |
11 | /// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) | |
12 | /// and the | |
13 | /// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). | |
14 | #[derive(Clone, Debug, Default)] | |
15 | pub struct ParserBuilder { | |
16 | ast: ast::parse::ParserBuilder, | |
17 | hir: hir::translate::TranslatorBuilder, | |
18 | } | |
19 | ||
20 | impl ParserBuilder { | |
21 | /// Create a new parser builder with a default configuration. | |
22 | pub fn new() -> ParserBuilder { | |
23 | ParserBuilder::default() | |
24 | } | |
25 | ||
26 | /// Build a parser from this configuration. | |
27 | pub fn build(&self) -> Parser { | |
f9f354fc | 28 | Parser { ast: self.ast.build(), hir: self.hir.build() } |
0531ce1d XL |
29 | } |
30 | ||
31 | /// Set the nesting limit for this parser. | |
32 | /// | |
33 | /// The nesting limit controls how deep the abstract syntax tree is allowed | |
34 | /// to be. If the AST exceeds the given limit (e.g., with too many nested | |
35 | /// groups), then an error is returned by the parser. | |
36 | /// | |
37 | /// The purpose of this limit is to act as a heuristic to prevent stack | |
38 | /// overflow for consumers that do structural induction on an `Ast` using | |
39 | /// explicit recursion. While this crate never does this (instead using | |
40 | /// constant stack space and moving the call stack to the heap), other | |
41 | /// crates may. | |
42 | /// | |
43 | /// This limit is not checked until the entire Ast is parsed. Therefore, | |
44 | /// if callers want to put a limit on the amount of heap space used, then | |
45 | /// they should impose a limit on the length, in bytes, of the concrete | |
46 | /// pattern string. In particular, this is viable since this parser | |
47 | /// implementation will limit itself to heap space proportional to the | |
48 | /// length of the pattern string. | |
49 | /// | |
50 | /// Note that a nest limit of `0` will return a nest limit error for most | |
51 | /// patterns but not all. For example, a nest limit of `0` permits `a` but | |
52 | /// not `ab`, since `ab` requires a concatenation, which results in a nest | |
53 | /// depth of `1`. In general, a nest limit is not something that manifests | |
54 | /// in an obvious way in the concrete syntax, therefore, it should not be | |
55 | /// used in a granular way. | |
56 | pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { | |
57 | self.ast.nest_limit(limit); | |
58 | self | |
59 | } | |
60 | ||
61 | /// Whether to support octal syntax or not. | |
62 | /// | |
63 | /// Octal syntax is a little-known way of uttering Unicode codepoints in | |
64 | /// a regular expression. For example, `a`, `\x61`, `\u0061` and | |
65 | /// `\141` are all equivalent regular expressions, where the last example | |
66 | /// shows octal syntax. | |
67 | /// | |
68 | /// While supporting octal syntax isn't in and of itself a problem, it does | |
69 | /// make good error messages harder. That is, in PCRE based regex engines, | |
70 | /// syntax like `\0` invokes a backreference, which is explicitly | |
71 | /// unsupported in Rust's regex engine. However, many users expect it to | |
72 | /// be supported. Therefore, when octal support is disabled, the error | |
73 | /// message will explicitly mention that backreferences aren't supported. | |
74 | /// | |
75 | /// Octal syntax is disabled by default. | |
76 | pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { | |
77 | self.ast.octal(yes); | |
78 | self | |
79 | } | |
80 | ||
81 | /// When enabled, the parser will permit the construction of a regular | |
82 | /// expression that may match invalid UTF-8. | |
83 | /// | |
84 | /// When disabled (the default), the parser is guaranteed to produce | |
85 | /// an expression that will only ever match valid UTF-8 (otherwise, the | |
86 | /// parser will return an error). | |
87 | /// | |
b7449926 XL |
88 | /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII |
89 | /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause | |
90 | /// the parser to return an error. Namely, a negated ASCII word boundary | |
91 | /// can result in matching positions that aren't valid UTF-8 boundaries. | |
0531ce1d XL |
92 | pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { |
93 | self.hir.allow_invalid_utf8(yes); | |
94 | self | |
95 | } | |
96 | ||
97 | /// Enable verbose mode in the regular expression. | |
98 | /// | |
99 | /// When enabled, verbose mode permits insignificant whitespace in many | |
100 | /// places in the regular expression, as well as comments. Comments are | |
101 | /// started using `#` and continue until the end of the line. | |
102 | /// | |
103 | /// By default, this is disabled. It may be selectively enabled in the | |
104 | /// regular expression by using the `x` flag regardless of this setting. | |
105 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { | |
106 | self.ast.ignore_whitespace(yes); | |
107 | self | |
108 | } | |
109 | ||
110 | /// Enable or disable the case insensitive flag by default. | |
111 | /// | |
112 | /// By default this is disabled. It may alternatively be selectively | |
113 | /// enabled in the regular expression itself via the `i` flag. | |
114 | pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { | |
115 | self.hir.case_insensitive(yes); | |
116 | self | |
117 | } | |
118 | ||
119 | /// Enable or disable the multi-line matching flag by default. | |
120 | /// | |
121 | /// By default this is disabled. It may alternatively be selectively | |
122 | /// enabled in the regular expression itself via the `m` flag. | |
123 | pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { | |
124 | self.hir.multi_line(yes); | |
125 | self | |
126 | } | |
127 | ||
128 | /// Enable or disable the "dot matches any character" flag by default. | |
129 | /// | |
130 | /// By default this is disabled. It may alternatively be selectively | |
131 | /// enabled in the regular expression itself via the `s` flag. | |
f9f354fc | 132 | pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { |
0531ce1d XL |
133 | self.hir.dot_matches_new_line(yes); |
134 | self | |
135 | } | |
136 | ||
137 | /// Enable or disable the "swap greed" flag by default. | |
138 | /// | |
139 | /// By default this is disabled. It may alternatively be selectively | |
140 | /// enabled in the regular expression itself via the `U` flag. | |
141 | pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { | |
142 | self.hir.swap_greed(yes); | |
143 | self | |
144 | } | |
145 | ||
146 | /// Enable or disable the Unicode flag (`u`) by default. | |
147 | /// | |
148 | /// By default this is **enabled**. It may alternatively be selectively | |
149 | /// disabled in the regular expression itself via the `u` flag. | |
150 | /// | |
151 | /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by | |
152 | /// default), a regular expression will fail to parse if Unicode mode is | |
153 | /// disabled and a sub-expression could possibly match invalid UTF-8. | |
154 | pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { | |
155 | self.hir.unicode(yes); | |
156 | self | |
157 | } | |
158 | } | |
159 | ||
160 | /// A convenience parser for regular expressions. | |
161 | /// | |
162 | /// This parser takes as input a regular expression pattern string (the | |
163 | /// "concrete syntax") and returns a high-level intermediate representation | |
164 | /// (the HIR) suitable for most types of analysis. In particular, this parser | |
165 | /// hides the intermediate state of producing an AST (the "abstract syntax"). | |
166 | /// The AST is itself far more complex than the HIR, so this parser serves as a | |
167 | /// convenience for never having to deal with it at all. | |
168 | /// | |
169 | /// If callers have more fine grained use cases that need an AST, then please | |
170 | /// see the [`ast::parse`](ast/parse/index.html) module. | |
171 | /// | |
172 | /// A `Parser` can be configured in more detail via a | |
173 | /// [`ParserBuilder`](struct.ParserBuilder.html). | |
174 | #[derive(Clone, Debug)] | |
175 | pub struct Parser { | |
176 | ast: ast::parse::Parser, | |
177 | hir: hir::translate::Translator, | |
178 | } | |
179 | ||
180 | impl Parser { | |
181 | /// Create a new parser with a default configuration. | |
182 | /// | |
183 | /// The parser can be run with the `parse` method. The parse method returns | |
184 | /// a high level intermediate representation of the given regular | |
185 | /// expression. | |
186 | /// | |
187 | /// To set configuration options on the parser, use | |
188 | /// [`ParserBuilder`](struct.ParserBuilder.html). | |
189 | pub fn new() -> Parser { | |
190 | ParserBuilder::new().build() | |
191 | } | |
192 | ||
193 | /// Parse the regular expression into a high level intermediate | |
194 | /// representation. | |
195 | pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> { | |
94b46f34 XL |
196 | let ast = self.ast.parse(pattern)?; |
197 | let hir = self.hir.translate(pattern, &ast)?; | |
0531ce1d XL |
198 | Ok(hir) |
199 | } | |
200 | } |