// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
10 | ||
/// The set of user configurable options for compiling zero or more regexes.
///
/// The builders generated in this module wrap a `RegexOptions` value and
/// write its fields directly; each field corresponds to the builder method
/// of the same name (`pats` is filled in by the builders' `new`).
#[derive(Clone, Debug)]
pub struct RegexOptions {
    /// The patterns to compile, in the order they were handed to the builder.
    pub pats: Vec<String>,
    /// Approximate limit on the size of a single compiled program. This
    /// roughly corresponds to a number of bytes.
    pub size_limit: usize,
    /// Approximate size of the cache used by the DFA. This roughly
    /// corresponds to a number of bytes and applies *per thread*.
    pub dfa_size_limit: usize,
    /// How deeply nested the parsed abstract syntax tree is allowed to be.
    pub nest_limit: u32,
    /// Value of the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Value of the multi-line matching (`m`) flag.
    pub multi_line: bool,
    /// Value of the any character (`s`) flag: whether `.` matches new lines.
    pub dot_matches_new_line: bool,
    /// Value of the greedy swap (`U`) flag.
    pub swap_greed: bool,
    /// Value of the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Value of the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal syntax (e.g. `\141`) is supported.
    pub octal: bool,
}
27 | ||
28 | impl Default for RegexOptions { | |
29 | fn default() -> Self { | |
30 | RegexOptions { | |
31 | pats: vec![], | |
32 | size_limit: 10 * (1<<20), | |
33 | dfa_size_limit: 2 * (1<<20), | |
0531ce1d | 34 | nest_limit: 250, |
8bb4bdeb XL |
35 | case_insensitive: false, |
36 | multi_line: false, | |
37 | dot_matches_new_line: false, | |
38 | swap_greed: false, | |
39 | ignore_whitespace: false, | |
40 | unicode: true, | |
94b46f34 | 41 | octal: false, |
8bb4bdeb XL |
42 | } |
43 | } | |
44 | } | |
45 | ||
// Generates a module named `$name` containing a `RegexBuilder` that produces
// `$regex_mod::Regex` values. `$only_utf8` is forwarded verbatim to
// `ExecBuilder::only_utf8` when the regex is built.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use error::Error;
            use exec::ExecBuilder;
            use super::RegexOptions;

            use $regex_mod::Regex;

            /// A configurable builder for a regular expression.
            ///
            /// Use a builder to control how a regex is compiled: it can set
            /// the default flags (which the expression itself may still
            /// override) and adjust various limits.
            pub struct RegexBuilder(RegexOptions);

            impl RegexBuilder {
                /// Create a new regular expression builder for `pattern`.
                ///
                /// The pattern is only stored here. If it is invalid, the
                /// error is reported when `build` is called.
                pub fn new(pattern: &str) -> Self {
                    let mut options = RegexOptions::default();
                    options.pats.push(String::from(pattern));
                    RegexBuilder(options)
                }

                /// Compile the regular expression described by this builder.
                ///
                /// The builder is only borrowed (its options are cloned), so
                /// it remains usable afterwards.
                ///
                /// Note that calling `as_str` on the resulting `Regex` will
                /// produce the pattern given to `new` verbatim. Notably, it
                /// will not incorporate any of the flags set on this builder.
                pub fn build(&self) -> Result<Regex, Error> {
                    let options = self.0.clone();
                    let exec = ExecBuilder::new_options(options)
                        .only_utf8($only_utf8)
                        .build()?;
                    Ok(Regex::from(exec))
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern match both their
                /// upper case and lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$`
                /// matches the end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut Self {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag.
                ///
                /// When `s` is set, `.` matches anything. When it is not set
                /// (the default), `.` matches anything except for new line.
                ///
                /// N.B. "matches anything" means "any byte" for
                /// `regex::bytes::Regex` expressions and means "any Unicode
                /// scalar value" for `regex::Regex` expressions.
                pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find
                /// the shortest match) and `a*?` is greedy (tries to find the
                /// longest match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut Self {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces is
                /// ignored between expressions of the pattern, and `#` starts
                /// a comment that runs until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such
                /// as `\w` only match ASCII word characters instead of all
                /// Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut Self {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last example shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a
                /// problem, it does make good error messages harder. That is,
                /// in PCRE based regex engines, syntax like `\0` invokes a
                /// backreference, which is explicitly unsupported in Rust's
                /// regex engine. However, many users expect it to be
                /// supported. Therefore, when octal support is disabled, the
                /// error message will explicitly mention that backreferences
                /// aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut Self {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied
                /// by a single compiled program. If the program exceeds this
                /// number, then a compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to
                /// set a global limit. In particular, if a regex is used from
                /// multiple threads simultaneously, then each thread may use
                /// up to the number of bytes specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), then an error is
                /// returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to
                /// prevent stack overflow for consumers that do structural
                /// induction on an `Ast` using explicit recursion. While this
                /// crate never does this (instead using constant stack space
                /// and moving the call stack to the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, if callers want to put a limit on the amount of
                /// heap space used, then they should impose a limit on the
                /// length, in bytes, of the concrete pattern string. In
                /// particular, this is viable since this parser
                /// implementation will limit itself to heap space
                /// proportional to the length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit
                /// error for most patterns but not all. For example, a nest
                /// limit of `0` permits `a` but not `ab`, since `ab` requires
                /// a concatenation, which results in a nest depth of `1`. In
                /// general, a nest limit is not something that manifests in
                /// an obvious way in the concrete syntax, therefore, it
                /// should not be used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut Self {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    }
}
224 | ||
225 | define_builder!(bytes, re_bytes, false); | |
226 | define_builder!(unicode, re_unicode, true); | |
227 | ||
// Generates a module named `$name` containing a `RegexSetBuilder` that
// produces `re_set::$regex_mod::RegexSet` values. `$only_utf8` is forwarded
// verbatim to `ExecBuilder::only_utf8` when the set is built.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use error::Error;
            use exec::ExecBuilder;
            use super::RegexOptions;

            use re_set::$regex_mod::RegexSet;

            /// A configurable builder for a set of regular expressions.
            ///
            /// A builder can be used to configure how the regexes are built,
            /// for example, by setting the default flags (which can be
            /// overridden in the expression itself) or setting various
            /// limits.
            pub struct RegexSetBuilder(RegexOptions);

            impl RegexSetBuilder {
                /// Create a new regular expression set builder with the given
                /// patterns.
                ///
                /// The patterns are stored in iteration order. If any of them
                /// is invalid, then an error will be returned when `build` is
                /// called.
                pub fn new<I, S>(patterns: I) -> RegexSetBuilder
                    where S: AsRef<str>, I: IntoIterator<Item=S>
                {
                    let mut builder = RegexSetBuilder(RegexOptions::default());
                    builder.0.pats.extend(
                        patterns.into_iter().map(|pat| pat.as_ref().to_owned()),
                    );
                    builder
                }

                /// Compile the regular expressions described by this builder
                /// into a set.
                ///
                /// The builder is only borrowed (its options are cloned), so
                /// it remains usable afterwards.
                pub fn build(&self) -> Result<RegexSet, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(RegexSet::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the patterns will match both
                /// upper case and lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$`
                /// matches the end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, where in
                /// `.` matches anything when `s` is set and matches anything
                /// except for new line when it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" for
                /// `regex::bytes::RegexSet` expressions and means "any Unicode
                /// scalar value" for `regex::RegexSet` expressions.
                pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find
                /// shortest match) and `a*?` is greedy (tries to find longest
                /// match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will
                /// be ignored between expressions of the pattern, and `#` can
                /// be used to start a comment until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such
                /// as `\w` only match ASCII word characters instead of all
                /// Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last example shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a
                /// problem, it does make good error messages harder. That is,
                /// in PCRE based regex engines, syntax like `\0` invokes a
                /// backreference, which is explicitly unsupported in Rust's
                /// regex engine. However, many users expect it to be
                /// supported. Therefore, when octal support is disabled, the
                /// error message will explicitly mention that backreferences
                /// aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied
                /// by a single compiled program. If the program exceeds this
                /// number, then a compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to
                /// set a global limit. In particular, if a regex is used from
                /// multiple threads simultaneously, then each thread may use
                /// up to the number of bytes specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), then an error is
                /// returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to
                /// prevent stack overflow for consumers that do structural
                /// induction on an `Ast` using explicit recursion. While this
                /// crate never does this (instead using constant stack space
                /// and moving the call stack to the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, if callers want to put a limit on the amount of
                /// heap space used, then they should impose a limit on the
                /// length, in bytes, of the concrete pattern string. In
                /// particular, this is viable since this parser
                /// implementation will limit itself to heap space
                /// proportional to the length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit
                /// error for most patterns but not all. For example, a nest
                /// limit of `0` permits `a` but not `ab`, since `ab` requires
                /// a concatenation, which results in a nest depth of `1`. In
                /// general, a nest limit is not something that manifests in
                /// an obvious way in the concrete syntax, therefore, it
                /// should not be used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    }
}
386 | ||
387 | define_set_builder!(set_bytes, bytes, false); | |
388 | define_set_builder!(set_unicode, unicode, true); |