]> git.proxmox.com Git - rustc.git/blame - src/vendor/regex/src/re_builder.rs
New upstream version 1.28.0~beta.14+dfsg1
[rustc.git] / src / vendor / regex / src / re_builder.rs
CommitLineData
8bb4bdeb
XL
// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
10
/// The set of user configurable options for compiling zero or more regexes.
///
/// The builders in this file wrap a value of this type: each builder setter
/// writes exactly one of these fields, and `build` hands a clone of the whole
/// struct to the compiler.
#[derive(Clone, Debug)]
pub struct RegexOptions {
    /// The patterns to compile, in the order they were given to the builder.
    pub pats: Vec<String>,
    /// Approximate size limit, in bytes, of a single compiled program.
    pub size_limit: usize,
    /// Approximate size limit, in bytes, of the cache used by the DFA.
    pub dfa_size_limit: usize,
    /// Maximum nesting depth permitted for the parsed abstract syntax tree.
    pub nest_limit: u32,
    /// Value of the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Value of the multi-line matching (`m`) flag.
    pub multi_line: bool,
    /// Value of the any character (`s`) flag.
    pub dot_matches_new_line: bool,
    /// Value of the greedy swap (`U`) flag.
    pub swap_greed: bool,
    /// Value of the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Value of the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal escape syntax is supported.
    pub octal: bool,
}
27
28impl Default for RegexOptions {
29 fn default() -> Self {
30 RegexOptions {
31 pats: vec![],
32 size_limit: 10 * (1<<20),
33 dfa_size_limit: 2 * (1<<20),
0531ce1d 34 nest_limit: 250,
8bb4bdeb
XL
35 case_insensitive: false,
36 multi_line: false,
37 dot_matches_new_line: false,
38 swap_greed: false,
39 ignore_whitespace: false,
40 unicode: true,
94b46f34 41 octal: false,
8bb4bdeb
XL
42 }
43 }
44}
45
// Generates a public module named `$name` that contains a `RegexBuilder`.
//
// * `$name`      - name of the generated module.
// * `$regex_mod` - module from which the `Regex` type returned by `build` is
//                  imported.
// * `$only_utf8` - expression forwarded to `ExecBuilder::only_utf8` when the
//                  regex is compiled.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use error::Error;
            use exec::ExecBuilder;
            use super::RegexOptions;

            use $regex_mod::Regex;

/// A configurable builder for a regular expression.
///
/// A builder can be used to configure how the regex is built, for example, by
/// setting the default flags (which can be overridden in the expression
/// itself) or setting various limits.
pub struct RegexBuilder(RegexOptions);

impl RegexBuilder {
    /// Create a new regular expression builder with the given pattern.
    ///
    /// If the pattern is invalid, then an error will be returned when
    /// `build` is called.
    pub fn new(pattern: &str) -> RegexBuilder {
        let mut builder = RegexBuilder(RegexOptions::default());
        builder.0.pats.push(pattern.to_owned());
        builder
    }

    /// Compile the regular expression using the builder's current options.
    ///
    /// The builder is only borrowed: its options are cloned for the
    /// compilation, so the same builder can be adjusted and built again.
    ///
    /// Note that calling `as_str` on the resulting `Regex` will produce the
    /// pattern given to `new` verbatim. Notably, it will not incorporate any
    /// of the flags set on this builder.
    pub fn build(&self) -> Result<Regex, Error> {
        ExecBuilder::new_options(self.0.clone())
            .only_utf8($only_utf8)
            .build()
            .map(Regex::from)
    }

    /// Set the value for the case insensitive (`i`) flag.
    ///
    /// When enabled, letters in the pattern will match both upper case and
    /// lower case variants.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
        self.0.case_insensitive = yes;
        self
    }

    /// Set the value for the multi-line matching (`m`) flag.
    ///
    /// When enabled, `^` matches the beginning of lines and `$` matches the
    /// end of lines.
    ///
    /// By default, they match beginning/end of the input.
    pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
        self.0.multi_line = yes;
        self
    }

    /// Set the value for the any character (`s`) flag, where in `.` matches
    /// anything when `s` is set and matches anything except for new line when
    /// it is not set (the default).
    ///
    /// N.B. "matches anything" means "any byte" for `regex::bytes::Regex`
    /// expressions and means "any Unicode scalar value" for `regex::Regex`
    /// expressions.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
        self.0.dot_matches_new_line = yes;
        self
    }

    /// Set the value for the greedy swap (`U`) flag.
    ///
    /// When enabled, a pattern like `a*` is lazy (tries to find shortest
    /// match) and `a*?` is greedy (tries to find longest match).
    ///
    /// By default, `a*` is greedy and `a*?` is lazy.
    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
        self.0.swap_greed = yes;
        self
    }

    /// Set the value for the ignore whitespace (`x`) flag.
    ///
    /// When enabled, whitespace such as new lines and spaces will be ignored
    /// between expressions of the pattern, and `#` can be used to start a
    /// comment until the next new line.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
        self.0.ignore_whitespace = yes;
        self
    }

    /// Set the value for the Unicode (`u`) flag.
    ///
    /// Enabled by default. When disabled, character classes such as `\w` only
    /// match ASCII word characters instead of all Unicode word characters.
    pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
        self.0.unicode = yes;
        self
    }

    /// Whether to support octal syntax or not.
    ///
    /// Octal syntax is a little-known way of uttering Unicode codepoints in
    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
    /// `\141` are all equivalent regular expressions, where the last example
    /// shows octal syntax.
    ///
    /// While supporting octal syntax isn't in and of itself a problem, it does
    /// make good error messages harder. That is, in PCRE based regex engines,
    /// syntax like `\0` invokes a backreference, which is explicitly
    /// unsupported in Rust's regex engine. However, many users expect it to
    /// be supported. Therefore, when octal support is disabled, the error
    /// message will explicitly mention that backreferences aren't supported.
    ///
    /// Octal syntax is disabled by default.
    pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
        self.0.octal = yes;
        self
    }

    /// Set the approximate size limit of the compiled regular expression.
    ///
    /// This roughly corresponds to the number of bytes occupied by a single
    /// compiled program. If the program exceeds this number, then a
    /// compilation error is returned.
    pub fn size_limit(&mut self, limit: usize) -> &mut RegexBuilder {
        self.0.size_limit = limit;
        self
    }

    /// Set the approximate size of the cache used by the DFA.
    ///
    /// This roughly corresponds to the number of bytes that the DFA will
    /// use while searching.
    ///
    /// Note that this is a *per thread* limit. There is no way to set a global
    /// limit. In particular, if a regex is used from multiple threads
    /// simultaneously, then each thread may use up to the number of bytes
    /// specified here.
    pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexBuilder {
        self.0.dfa_size_limit = limit;
        self
    }

    /// Set the nesting limit for this parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is allowed
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
    /// groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow for consumers that do structural induction on an `Ast` using
    /// explicit recursion. While this crate never does this (instead using
    /// constant stack space and moving the call stack to the heap), other
    /// crates may.
    ///
    /// This limit is not checked until the entire Ast is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since this parser
    /// implementation will limit itself to heap space proportional to the
    /// length of the pattern string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
    /// not `ab`, since `ab` requires a concatenation, which results in a nest
    /// depth of `1`. In general, a nest limit is not something that manifests
    /// in an obvious way in the concrete syntax, therefore, it should not be
    /// used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
        self.0.nest_limit = limit;
        self
    }
}
        }
    }
}
224
225define_builder!(bytes, re_bytes, false);
226define_builder!(unicode, re_unicode, true);
227
// Generates a public module named `$name` that contains a `RegexSetBuilder`.
//
// * `$name`      - name of the generated module.
// * `$regex_mod` - submodule of `re_set` from which the `RegexSet` type
//                  returned by `build` is imported.
// * `$only_utf8` - expression forwarded to `ExecBuilder::only_utf8` when the
//                  set is compiled.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use error::Error;
            use exec::ExecBuilder;
            use super::RegexOptions;

            use re_set::$regex_mod::RegexSet;

/// A configurable builder for a set of regular expressions.
///
/// A builder can be used to configure how the regexes are built, for example,
/// by setting the default flags (which can be overridden in the expression
/// itself) or setting various limits.
pub struct RegexSetBuilder(RegexOptions);

impl RegexSetBuilder {
    /// Create a new regular expression set builder with the given patterns.
    ///
    /// The patterns are stored in iteration order. If any pattern is
    /// invalid, then an error will be returned when `build` is called.
    pub fn new<I, S>(patterns: I) -> RegexSetBuilder
            where S: AsRef<str>, I: IntoIterator<Item=S> {
        let mut builder = RegexSetBuilder(RegexOptions::default());
        builder.0.pats.extend(
            patterns.into_iter().map(|pat| pat.as_ref().to_owned()));
        builder
    }

    /// Compile the regular expressions into a set using the builder's
    /// current options.
    ///
    /// The builder is only borrowed: its options are cloned for the
    /// compilation, so the same builder can be adjusted and built again.
    pub fn build(&self) -> Result<RegexSet, Error> {
        ExecBuilder::new_options(self.0.clone())
            .only_utf8($only_utf8)
            .build()
            .map(RegexSet::from)
    }

    /// Set the value for the case insensitive (`i`) flag.
    ///
    /// When enabled, letters in the patterns will match both upper case and
    /// lower case variants.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
        self.0.case_insensitive = yes;
        self
    }

    /// Set the value for the multi-line matching (`m`) flag.
    ///
    /// When enabled, `^` matches the beginning of lines and `$` matches the
    /// end of lines.
    ///
    /// By default, they match beginning/end of the input.
    pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
        self.0.multi_line = yes;
        self
    }

    /// Set the value for the any character (`s`) flag, where in `.` matches
    /// anything when `s` is set and matches anything except for new line when
    /// it is not set (the default).
    ///
    /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
    /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
    /// expressions.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
        self.0.dot_matches_new_line = yes;
        self
    }

    /// Set the value for the greedy swap (`U`) flag.
    ///
    /// When enabled, a pattern like `a*` is lazy (tries to find shortest
    /// match) and `a*?` is greedy (tries to find longest match).
    ///
    /// By default, `a*` is greedy and `a*?` is lazy.
    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
        self.0.swap_greed = yes;
        self
    }

    /// Set the value for the ignore whitespace (`x`) flag.
    ///
    /// When enabled, whitespace such as new lines and spaces will be ignored
    /// between expressions of the pattern, and `#` can be used to start a
    /// comment until the next new line.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexSetBuilder {
        self.0.ignore_whitespace = yes;
        self
    }

    /// Set the value for the Unicode (`u`) flag.
    ///
    /// Enabled by default. When disabled, character classes such as `\w` only
    /// match ASCII word characters instead of all Unicode word characters.
    pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
        self.0.unicode = yes;
        self
    }

    /// Whether to support octal syntax or not.
    ///
    /// Octal syntax is a little-known way of uttering Unicode codepoints in
    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
    /// `\141` are all equivalent regular expressions, where the last example
    /// shows octal syntax.
    ///
    /// While supporting octal syntax isn't in and of itself a problem, it does
    /// make good error messages harder. That is, in PCRE based regex engines,
    /// syntax like `\0` invokes a backreference, which is explicitly
    /// unsupported in Rust's regex engine. However, many users expect it to
    /// be supported. Therefore, when octal support is disabled, the error
    /// message will explicitly mention that backreferences aren't supported.
    ///
    /// Octal syntax is disabled by default.
    pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
        self.0.octal = yes;
        self
    }

    /// Set the approximate size limit of the compiled regular expression.
    ///
    /// This roughly corresponds to the number of bytes occupied by a single
    /// compiled program. If the program exceeds this number, then a
    /// compilation error is returned.
    pub fn size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
        self.0.size_limit = limit;
        self
    }

    /// Set the approximate size of the cache used by the DFA.
    ///
    /// This roughly corresponds to the number of bytes that the DFA will
    /// use while searching.
    ///
    /// Note that this is a *per thread* limit. There is no way to set a global
    /// limit. In particular, if a regex is used from multiple threads
    /// simultaneously, then each thread may use up to the number of bytes
    /// specified here.
    pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
        self.0.dfa_size_limit = limit;
        self
    }

    /// Set the nesting limit for this parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is allowed
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
    /// groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow for consumers that do structural induction on an `Ast` using
    /// explicit recursion. While this crate never does this (instead using
    /// constant stack space and moving the call stack to the heap), other
    /// crates may.
    ///
    /// This limit is not checked until the entire Ast is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since this parser
    /// implementation will limit itself to heap space proportional to the
    /// length of the pattern string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
    /// not `ab`, since `ab` requires a concatenation, which results in a nest
    /// depth of `1`. In general, a nest limit is not something that manifests
    /// in an obvious way in the concrete syntax, therefore, it should not be
    /// used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
        self.0.nest_limit = limit;
        self
    }
}
        }
    }
}
386
387define_set_builder!(set_bytes, bytes, false);
388define_set_builder!(set_unicode, unicode, true);