[rustc.git] / vendor / regex-syntax / src / parser.rs

use ast;
use hir;

use Result;

/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser
/// before a [`Parser`](struct.Parser.html) is created with `build`.
///
/// This type combines the builder options for both the
/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html)
/// and the
/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html).
/// Each setter on this type forwards to exactly one of those two builders.
#[derive(Clone, Debug, Default)]
pub struct ParserBuilder {
    /// Options for the first stage: parsing the pattern string into an AST.
    ast: ast::parse::ParserBuilder,
    /// Options for the second stage: translating the AST into an HIR.
    hir: hir::translate::TranslatorBuilder,
}

impl ParserBuilder {
    /// Construct a builder whose options all start at their defaults.
    pub fn new() -> ParserBuilder {
        Self::default()
    }

    /// Produce a [`Parser`](struct.Parser.html) that uses the options
    /// currently set on this builder.
    ///
    /// The builder is only borrowed, so it may be adjusted further and used
    /// to build additional parsers afterwards.
    pub fn build(&self) -> Parser {
        let ast = self.ast.build();
        let hir = self.hir.build();
        Parser { ast, hir }
    }

    /// Cap how deeply nested the abstract syntax tree may be.
    ///
    /// When parsing yields an AST deeper than `limit` (too many nested
    /// groups, for example), the parser returns an error.
    ///
    /// The limit is a heuristic guard against stack overflow in consumers
    /// that perform structural induction on an `Ast` using explicit
    /// recursion. This crate never does that itself (it uses constant stack
    /// space and moves the call stack to the heap), but other crates may.
    ///
    /// The check only happens once the entire `Ast` has been parsed, so it
    /// does not bound heap usage. Callers who need such a bound should
    /// instead limit the length, in bytes, of the concrete pattern string.
    /// That approach is viable because this parser implementation limits
    /// itself to heap space proportional to the length of the pattern
    /// string.
    ///
    /// A nest limit of `0` rejects most patterns, but not all of them: `a`
    /// is permitted, while `ab` is not, because `ab` requires a
    /// concatenation and therefore has a nest depth of `1`. In general,
    /// nesting depth does not manifest in an obvious way in the concrete
    /// syntax, so the limit should not be used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
        self.ast.nest_limit(limit);
        self
    }

    /// Toggle support for octal escape syntax.
    ///
    /// Octal syntax is a little-known way of writing Unicode codepoints in a
    /// regular expression. For instance, `a`, `\x61`, `\u0061` and `\141`
    /// are all equivalent regular expressions; the last one uses octal
    /// syntax.
    ///
    /// Supporting octal syntax is not a problem in and of itself, but it
    /// makes good error messages harder to produce. In PCRE based regex
    /// engines, syntax like `\0` invokes a backreference, which is
    /// explicitly unsupported in Rust's regex engine even though many users
    /// expect it to work. For that reason, when octal support is disabled
    /// the error message explicitly mentions that backreferences aren't
    /// supported.
    ///
    /// Octal syntax is disabled by default.
    pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
        self.ast.octal(yes);
        self
    }

    /// Toggle whether the resulting expression may match invalid UTF-8.
    ///
    /// When disabled (the default), the parser guarantees that any
    /// expression it produces will only ever match valid UTF-8, and returns
    /// an error for patterns where it cannot give that guarantee. When
    /// enabled, such patterns are permitted.
    ///
    /// Perhaps surprisingly, while invalid UTF-8 is not allowed, a negated
    /// ASCII word boundary (written `(?-u:\B)` in the concrete syntax) makes
    /// the parser return an error. This is because a negated ASCII word
    /// boundary can match at positions that aren't valid UTF-8 boundaries.
    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.allow_invalid_utf8(yes);
        self
    }

    /// Toggle verbose mode for the regular expression.
    ///
    /// Verbose mode permits insignificant whitespace in many places in the
    /// regular expression, and also permits comments. A comment starts with
    /// `#` and continues until the end of the line.
    ///
    /// This is disabled by default. Regardless of this setting, it may be
    /// selectively enabled inside the regular expression with the `x` flag.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
        self.ast.ignore_whitespace(yes);
        self
    }

    /// Set the default state of the case insensitive flag.
    ///
    /// This is disabled by default. Alternatively, it may be selectively
    /// enabled in the regular expression itself via the `i` flag.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.case_insensitive(yes);
        self
    }

    /// Set the default state of the multi-line matching flag.
    ///
    /// This is disabled by default. Alternatively, it may be selectively
    /// enabled in the regular expression itself via the `m` flag.
    pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.multi_line(yes);
        self
    }

    /// Set the default state of the "dot matches any character" flag.
    ///
    /// This is disabled by default. Alternatively, it may be selectively
    /// enabled in the regular expression itself via the `s` flag.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.dot_matches_new_line(yes);
        self
    }

    /// Set the default state of the "swap greed" flag.
    ///
    /// This is disabled by default. Alternatively, it may be selectively
    /// enabled in the regular expression itself via the `U` flag.
    pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.swap_greed(yes);
        self
    }

    /// Set the default state of the Unicode flag (`u`).
    ///
    /// This is **enabled** by default. Alternatively, it may be selectively
    /// disabled in the regular expression itself via the `u` flag.
    ///
    /// Note that unless `allow_invalid_utf8` is enabled (it is disabled by
    /// default), a regular expression fails to parse if Unicode mode is
    /// disabled and a sub-expression could possibly match invalid UTF-8.
    pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.unicode(yes);
        self
    }
}

/// A convenience parser for regular expressions.
///
/// This parser takes as input a regular expression pattern string (the
/// "concrete syntax") and returns a high-level intermediate representation
/// (the HIR) suitable for most types of analysis. In particular, this parser
/// hides the intermediate state of producing an AST (the "abstract syntax").
/// The AST is itself far more complex than the HIR, so this parser serves as a
/// convenience for never having to deal with it at all.
///
/// If callers have more fine-grained use cases that need an AST, then please
/// see the [`ast::parse`](ast/parse/index.html) module.
///
/// A `Parser` can be configured in more detail via a
/// [`ParserBuilder`](struct.ParserBuilder.html).
#[derive(Clone, Debug)]
pub struct Parser {
    /// First stage: turns the pattern string into an AST.
    ast: ast::parse::Parser,
    /// Second stage: turns the AST into an HIR.
    hir: hir::translate::Translator,
}

impl Parser {
    /// Construct a parser that uses the default configuration.
    ///
    /// Call `parse` on the returned value to obtain a high level
    /// intermediate representation of a regular expression.
    ///
    /// When configuration options need to be set, go through
    /// [`ParserBuilder`](struct.ParserBuilder.html) instead.
    pub fn new() -> Parser {
        let builder = ParserBuilder::new();
        builder.build()
    }

    /// Parse `pattern` into a high level intermediate representation.
    ///
    /// The pattern is first parsed into an AST, which is then translated
    /// into the returned `Hir`. A failure in either step is returned as the
    /// error of this method.
    pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> {
        let syntax_tree = self.ast.parse(pattern)?;
        Ok(self.hir.translate(pattern, &syntax_tree)?)
    }
}
Commit	Line	Data
0531ce1d XL	1	use ast;
	2	use hir;
	3
	4	use Result;
	5
	6	/// A builder for a regular expression parser.
	7	///
	8	/// This builder permits modifying configuration options for the parser.
	9	///
	10	/// This type combines the builder options for both the
	11	/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html)
	12	/// and the
	13	/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html).
	14	#[derive(Clone, Debug, Default)]
	15	pub struct ParserBuilder {
	16	ast: ast::parse::ParserBuilder,
	17	hir: hir::translate::TranslatorBuilder,
	18	}
	19
	20	impl ParserBuilder {
	21	/// Create a new parser builder with a default configuration.
	22	pub fn new() -> ParserBuilder {
	23	ParserBuilder::default()
	24	}
	25
	26	/// Build a parser from this configuration with the given pattern.
	27	pub fn build(&self) -> Parser {
f9f354fc	28	Parser { ast: self.ast.build(), hir: self.hir.build() }
0531ce1d XL	29	}
	30
	31	/// Set the nesting limit for this parser.
	32	///
	33	/// The nesting limit controls how deep the abstract syntax tree is allowed
	34	/// to be. If the AST exceeds the given limit (e.g., with too many nested
	35	/// groups), then an error is returned by the parser.
	36	///
	37	/// The purpose of this limit is to act as a heuristic to prevent stack
	38	/// overflow for consumers that do structural induction on an `Ast` using
	39	/// explicit recursion. While this crate never does this (instead using
	40	/// constant stack space and moving the call stack to the heap), other
	41	/// crates may.
	42	///
	43	/// This limit is not checked until the entire Ast is parsed. Therefore,
	44	/// if callers want to put a limit on the amount of heap space used, then
	45	/// they should impose a limit on the length, in bytes, of the concrete
	46	/// pattern string. In particular, this is viable since this parser
	47	/// implementation will limit itself to heap space proportional to the
	48	/// lenth of the pattern string.
	49	///
	50	/// Note that a nest limit of `0` will return a nest limit error for most
	51	/// patterns but not all. For example, a nest limit of `0` permits `a` but
	52	/// not `ab`, since `ab` requires a concatenation, which results in a nest
	53	/// depth of `1`. In general, a nest limit is not something that manifests
	54	/// in an obvious way in the concrete syntax, therefore, it should not be
	55	/// used in a granular way.
	56	pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
	57	self.ast.nest_limit(limit);
	58	self
	59	}
	60
	61	/// Whether to support octal syntax or not.
	62	///
	63	/// Octal syntax is a little-known way of uttering Unicode codepoints in
	64	/// a regular expression. For example, `a`, `\x61`, `\u0061` and
	65	/// `\141` are all equivalent regular expressions, where the last example
	66	/// shows octal syntax.
	67	///
	68	/// While supporting octal syntax isn't in and of itself a problem, it does
	69	/// make good error messages harder. That is, in PCRE based regex engines,
	70	/// syntax like `\0` invokes a backreference, which is explicitly
	71	/// unsupported in Rust's regex engine. However, many users expect it to
	72	/// be supported. Therefore, when octal support is disabled, the error
	73	/// message will explicitly mention that backreferences aren't supported.
	74	///
	75	/// Octal syntax is disabled by default.
	76	pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
	77	self.ast.octal(yes);
	78	self
	79	}
	80
	81	/// When enabled, the parser will permit the construction of a regular
	82	/// expression that may match invalid UTF-8.
	83	///
	84	/// When disabled (the default), the parser is guaranteed to produce
	85	/// an expression that will only ever match valid UTF-8 (otherwise, the
	86	/// parser will return an error).
	87	///
b7449926 XL	88	/// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
	89	/// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
	90	/// the parser to return an error. Namely, a negated ASCII word boundary
	91	/// can result in matching positions that aren't valid UTF-8 boundaries.
0531ce1d XL	92	pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
	93	self.hir.allow_invalid_utf8(yes);
	94	self
	95	}
	96
	97	/// Enable verbose mode in the regular expression.
	98	///
	99	/// When enabled, verbose mode permits insigificant whitespace in many
	100	/// places in the regular expression, as well as comments. Comments are
	101	/// started using `#` and continue until the end of the line.
	102	///
	103	/// By default, this is disabled. It may be selectively enabled in the
	104	/// regular expression by using the `x` flag regardless of this setting.
	105	pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
	106	self.ast.ignore_whitespace(yes);
	107	self
	108	}
	109
	110	/// Enable or disable the case insensitive flag by default.
	111	///
	112	/// By default this is disabled. It may alternatively be selectively
	113	/// enabled in the regular expression itself via the `i` flag.
	114	pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
	115	self.hir.case_insensitive(yes);
	116	self
	117	}
	118
	119	/// Enable or disable the multi-line matching flag by default.
	120	///
	121	/// By default this is disabled. It may alternatively be selectively
	122	/// enabled in the regular expression itself via the `m` flag.
	123	pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
	124	self.hir.multi_line(yes);
	125	self
	126	}
	127
	128	/// Enable or disable the "dot matches any character" flag by default.
	129	///
	130	/// By default this is disabled. It may alternatively be selectively
	131	/// enabled in the regular expression itself via the `s` flag.
f9f354fc	132	pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
0531ce1d XL	133	self.hir.dot_matches_new_line(yes);
	134	self
	135	}
	136
	137	/// Enable or disable the "swap greed" flag by default.
	138	///
	139	/// By default this is disabled. It may alternatively be selectively
	140	/// enabled in the regular expression itself via the `U` flag.
	141	pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
	142	self.hir.swap_greed(yes);
	143	self
	144	}
	145
	146	/// Enable or disable the Unicode flag (`u`) by default.
	147	///
	148	/// By default this is enabled. It may alternatively be selectively
	149	/// disabled in the regular expression itself via the `u` flag.
	150	///
	151	/// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
	152	/// default), a regular expression will fail to parse if Unicode mode is
	153	/// disabled and a sub-expression could possibly match invalid UTF-8.
	154	pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
	155	self.hir.unicode(yes);
	156	self
	157	}
	158	}
	159
	160	/// A convenience parser for regular expressions.
	161	///
	162	/// This parser takes as input a regular expression pattern string (the
	163	/// "concrete syntax") and returns a high-level intermediate representation
	164	/// (the HIR) suitable for most types of analysis. In particular, this parser
	165	/// hides the intermediate state of producing an AST (the "abstract syntax").
	166	/// The AST is itself far more complex than the HIR, so this parser serves as a
	167	/// convenience for never having to deal with it at all.
	168	///
	169	/// If callers have more fine grained use cases that need an AST, then please
	170	/// see the [`ast::parse`](ast/parse/index.html) module.
	171	///
	172	/// A `Parser` can be configured in more detail via a
	173	/// [`ParserBuilder`](struct.ParserBuilder.html).
	174	#[derive(Clone, Debug)]
	175	pub struct Parser {
	176	ast: ast::parse::Parser,
	177	hir: hir::translate::Translator,
	178	}
	179
	180	impl Parser {
	181	/// Create a new parser with a default configuration.
	182	///
	183	/// The parser can be run with `parse` method. The parse method returns
	184	/// a high level intermediate representation of the given regular
	185	/// expression.
	186	///
	187	/// To set configuration options on the parser, use
	188	/// [`ParserBuilder`](struct.ParserBuilder.html).
	189	pub fn new() -> Parser {
	190	ParserBuilder::new().build()
	191	}
	192
	193	/// Parse the regular expression into a high level intermediate
	194	/// representation.
	195	pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> {
94b46f34 XL	196	let ast = self.ast.parse(pattern)?;
94b46f34 XL	197	let hir = self.hir.translate(pattern, &ast)?;
0531ce1d XL	198	Ok(hir)
	199	}
	200	}