]>
git.proxmox.com Git - cargo.git/blob - vendor/regex/src/re_builder.rs
1 /// The set of user configurable options for compiling zero or more regexes.
2 #[derive(Clone, Debug)]
4 pub struct RegexOptions
{
7 pub dfa_size_limit
: usize,
9 pub case_insensitive
: bool
,
11 pub dot_matches_new_line
: bool
,
13 pub ignore_whitespace
: bool
,
18 impl Default
for RegexOptions
{
19 fn default() -> Self {
22 size_limit
: 10 * (1 << 20),
23 dfa_size_limit
: 2 * (1 << 20),
25 case_insensitive
: false,
27 dot_matches_new_line
: false,
29 ignore_whitespace
: false,
36 macro_rules
! define_builder
{
37 ($name
:ident
, $regex_mod
:ident
, $only_utf8
:expr
) => {
39 use super::RegexOptions
;
40 use crate::error
::Error
;
41 use crate::exec
::ExecBuilder
;
43 use crate::$regex_mod
::Regex
;
45 /// A configurable builder for a regular expression.
47 /// A builder can be used to configure how the regex is built, for example, by
48 /// setting the default flags (which can be overridden in the expression
49 /// itself) or setting various limits.
51 pub struct RegexBuilder(RegexOptions
);
54 /// Create a new regular expression builder with the given pattern.
56 /// If the pattern is invalid, then an error will be returned when
57 /// `build` is called.
58 pub fn new(pattern
: &str) -> RegexBuilder
{
59 let mut builder
= RegexBuilder(RegexOptions
::default());
60 builder
.0.pats
.push(pattern
.to_owned());
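64 /// Compile the regular expression from the builder's current configuration. The builder is only borrowed (`&self`), not consumed, so it can be reused afterwards.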
66 /// Note that calling `as_str` on the resulting `Regex` will produce the
67 /// pattern given to `new` verbatim. Notably, it will not incorporate any
68 /// of the flags set on this builder.
69 pub fn build(&self) -> Result
<Regex
, Error
> {
70 ExecBuilder
::new_options(self.0.clone())
71 .only_utf8($only_utf8
)
76 /// Set the value for the case insensitive (`i`) flag.
78 /// When enabled, letters in the pattern will match both upper case and
79 /// lower case variants.
80 pub fn case_insensitive(
83 ) -> &mut RegexBuilder
{
84 self.0.case_insensitive
= yes
;
88 /// Set the value for the multi-line matching (`m`) flag.
90 /// When enabled, `^` matches the beginning of lines and `$` matches the end of lines.
93 /// By default, they match beginning/end of the input.
94 pub fn multi_line(&mut self, yes
: bool
) -> &mut RegexBuilder
{
95 self.0.multi_line
= yes
;
99 /// Set the value for the any character (`s`) flag, where in `.` matches
100 /// anything when `s` is set and matches anything except for new line when
101 /// it is not set (the default).
103 /// N.B. "matches anything" means "any byte" when Unicode is disabled and
104 /// means "any valid UTF-8 encoding of any Unicode scalar value" when
105 /// Unicode is enabled.
106 pub fn dot_matches_new_line(
109 ) -> &mut RegexBuilder
{
110 self.0.dot_matches_new_line
= yes
;
114 /// Set the value for the greedy swap (`U`) flag.
116 /// When enabled, a pattern like `a*` is lazy (tries to find shortest
117 /// match) and `a*?` is greedy (tries to find longest match).
119 /// By default, `a*` is greedy and `a*?` is lazy.
120 pub fn swap_greed(&mut self, yes
: bool
) -> &mut RegexBuilder
{
121 self.0.swap_greed
= yes
;
125 /// Set the value for the ignore whitespace (`x`) flag.
127 /// When enabled, whitespace such as new lines and spaces will be ignored
128 /// between expressions of the pattern, and `#` can be used to start a
129 /// comment until the next new line.
130 pub fn ignore_whitespace(
133 ) -> &mut RegexBuilder
{
134 self.0.ignore_whitespace
= yes
;
138 /// Set the value for the Unicode (`u`) flag.
140 /// Enabled by default. When disabled, character classes such as `\w` only
141 /// match ASCII word characters instead of all Unicode word characters.
142 pub fn unicode(&mut self, yes
: bool
) -> &mut RegexBuilder
{
143 self.0.unicode
= yes
;
147 /// Whether to support octal syntax or not.
149 /// Octal syntax is a little-known way of uttering Unicode codepoints in
150 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
151 /// `\141` are all equivalent regular expressions, where the last example
152 /// shows octal syntax.
154 /// While supporting octal syntax isn't in and of itself a problem, it does
155 /// make good error messages harder. That is, in PCRE based regex engines,
156 /// syntax like `\0` invokes a backreference, which is explicitly
157 /// unsupported in Rust's regex engine. However, many users expect it to
158 /// be supported. Therefore, when octal support is disabled, the error
159 /// message will explicitly mention that backreferences aren't supported.
161 /// Octal syntax is disabled by default.
162 pub fn octal(&mut self, yes
: bool
) -> &mut RegexBuilder
{
167 /// Set the approximate size limit of the compiled regular expression.
169 /// This roughly corresponds to the number of bytes occupied by a single
170 /// compiled program. If the program exceeds this number, then a
171 /// compilation error is returned.
175 ) -> &mut RegexBuilder
{
176 self.0.size_limit
= limit
;
180 /// Set the approximate size of the cache used by the DFA.
182 /// This roughly corresponds to the number of bytes that the DFA will
183 /// use while searching.
185 /// Note that this is a *per thread* limit. There is no way to set a global
186 /// limit. In particular, if a regex is used from multiple threads
187 /// simultaneously, then each thread may use up to the number of bytes specified here.
189 pub fn dfa_size_limit(
192 ) -> &mut RegexBuilder
{
193 self.0.dfa_size_limit
= limit
;
197 /// Set the nesting limit for this parser.
199 /// The nesting limit controls how deep the abstract syntax tree is allowed
200 /// to be. If the AST exceeds the given limit (e.g., with too many nested
201 /// groups), then an error is returned by the parser.
203 /// The purpose of this limit is to act as a heuristic to prevent stack
204 /// overflow for consumers that do structural induction on an `Ast` using
205 /// explicit recursion. While this crate never does this (instead using
206 /// constant stack space and moving the call stack to the heap), other crates may.
209 /// This limit is not checked until the entire Ast is parsed. Therefore,
210 /// if callers want to put a limit on the amount of heap space used, then
211 /// they should impose a limit on the length, in bytes, of the concrete
212 /// pattern string. In particular, this is viable since this parser
213 /// implementation will limit itself to heap space proportional to the
214 /// length of the pattern string.
216 /// Note that a nest limit of `0` will return a nest limit error for most
217 /// patterns but not all. For example, a nest limit of `0` permits `a` but
218 /// not `ab`, since `ab` requires a concatenation, which results in a nest
219 /// depth of `1`. In general, a nest limit is not something that manifests
220 /// in an obvious way in the concrete syntax, therefore, it should not be
221 /// used in a granular way.
222 pub fn nest_limit(&mut self, limit
: u32) -> &mut RegexBuilder
{
223 self.0.nest_limit
= limit
;
231 define_builder
!(bytes
, re_bytes
, false);
232 define_builder
!(unicode
, re_unicode
, true);
234 macro_rules
! define_set_builder
{
235 ($name
:ident
, $regex_mod
:ident
, $only_utf8
:expr
) => {
237 use super::RegexOptions
;
238 use crate::error
::Error
;
239 use crate::exec
::ExecBuilder
;
241 use crate::re_set
::$regex_mod
::RegexSet
;
243 /// A configurable builder for a set of regular expressions.
245 /// A builder can be used to configure how the regexes are built, for example,
246 /// by setting the default flags (which can be overridden in the expression
247 /// itself) or setting various limits.
249 pub struct RegexSetBuilder(RegexOptions
);
251 impl RegexSetBuilder
{
252 /// Create a new regular expression set builder with the given patterns.
254 /// If any of the patterns is invalid, then an error will be returned when
255 /// `build` is called.
256 pub fn new
<I
, S
>(patterns
: I
) -> RegexSetBuilder
259 I
: IntoIterator
<Item
= S
>,
261 let mut builder
= RegexSetBuilder(RegexOptions
::default());
262 for pat
in patterns
{
263 builder
.0.pats
.push(pat
.as_ref().to_owned());
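268 /// Compile the regular expressions into a set using the builder's current configuration. The builder is only borrowed (`&self`), not consumed, so it can be reused afterwards.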
269 pub fn build(&self) -> Result
<RegexSet
, Error
> {
270 ExecBuilder
::new_options(self.0.clone())
271 .only_utf8($only_utf8
)
276 /// Set the value for the case insensitive (`i`) flag.
277 pub fn case_insensitive(
280 ) -> &mut RegexSetBuilder
{
281 self.0.case_insensitive
= yes
;
285 /// Set the value for the multi-line matching (`m`) flag.
289 ) -> &mut RegexSetBuilder
{
290 self.0.multi_line
= yes
;
294 /// Set the value for the any character (`s`) flag, where in `.` matches
295 /// anything when `s` is set and matches anything except for new line when
296 /// it is not set (the default).
298 /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
299 /// expressions and means "any Unicode scalar value" for `regex::RegexSet` expressions.
301 pub fn dot_matches_new_line(
304 ) -> &mut RegexSetBuilder
{
305 self.0.dot_matches_new_line
= yes
;
309 /// Set the value for the greedy swap (`U`) flag.
313 ) -> &mut RegexSetBuilder
{
314 self.0.swap_greed
= yes
;
318 /// Set the value for the ignore whitespace (`x`) flag.
319 pub fn ignore_whitespace(
322 ) -> &mut RegexSetBuilder
{
323 self.0.ignore_whitespace
= yes
;
327 /// Set the value for the Unicode (`u`) flag.
328 pub fn unicode(&mut self, yes
: bool
) -> &mut RegexSetBuilder
{
329 self.0.unicode
= yes
;
333 /// Whether to support octal syntax or not.
335 /// Octal syntax is a little-known way of uttering Unicode codepoints in
336 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
337 /// `\141` are all equivalent regular expressions, where the last example
338 /// shows octal syntax.
340 /// While supporting octal syntax isn't in and of itself a problem, it does
341 /// make good error messages harder. That is, in PCRE based regex engines,
342 /// syntax like `\0` invokes a backreference, which is explicitly
343 /// unsupported in Rust's regex engine. However, many users expect it to
344 /// be supported. Therefore, when octal support is disabled, the error
345 /// message will explicitly mention that backreferences aren't supported.
347 /// Octal syntax is disabled by default.
348 pub fn octal(&mut self, yes
: bool
) -> &mut RegexSetBuilder
{
353 /// Set the approximate size limit of the compiled regular expression.
355 /// This roughly corresponds to the number of bytes occupied by a single
356 /// compiled program. If the program exceeds this number, then a
357 /// compilation error is returned.
361 ) -> &mut RegexSetBuilder
{
362 self.0.size_limit
= limit
;
366 /// Set the approximate size of the cache used by the DFA.
368 /// This roughly corresponds to the number of bytes that the DFA will
369 /// use while searching.
371 /// Note that this is a *per thread* limit. There is no way to set a global
372 /// limit. In particular, if a regex is used from multiple threads
373 /// simultaneously, then each thread may use up to the number of bytes specified here.
375 pub fn dfa_size_limit(
378 ) -> &mut RegexSetBuilder
{
379 self.0.dfa_size_limit
= limit
;
383 /// Set the nesting limit for this parser.
385 /// The nesting limit controls how deep the abstract syntax tree is allowed
386 /// to be. If the AST exceeds the given limit (e.g., with too many nested
387 /// groups), then an error is returned by the parser.
389 /// The purpose of this limit is to act as a heuristic to prevent stack
390 /// overflow for consumers that do structural induction on an `Ast` using
391 /// explicit recursion. While this crate never does this (instead using
392 /// constant stack space and moving the call stack to the heap), other crates may.
395 /// This limit is not checked until the entire Ast is parsed. Therefore,
396 /// if callers want to put a limit on the amount of heap space used, then
397 /// they should impose a limit on the length, in bytes, of the concrete
398 /// pattern string. In particular, this is viable since this parser
399 /// implementation will limit itself to heap space proportional to the
400 /// length of the pattern string.
402 /// Note that a nest limit of `0` will return a nest limit error for most
403 /// patterns but not all. For example, a nest limit of `0` permits `a` but
404 /// not `ab`, since `ab` requires a concatenation, which results in a nest
405 /// depth of `1`. In general, a nest limit is not something that manifests
406 /// in an obvious way in the concrete syntax, therefore, it should not be
407 /// used in a granular way.
411 ) -> &mut RegexSetBuilder
{
412 self.0.nest_limit
= limit
;
420 define_set_builder
!(set_bytes
, bytes
, false);
421 define_set_builder
!(set_unicode
, unicode
, true);