1 // pest. The Elegant Parser
// Copyright (c) 2018 Dragoș Tiselice
4 // Licensed under the Apache License, Version 2.0
5 // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
6 // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. All files in the project carrying such notice may not be copied,
8 // modified, or distributed except according to those terms.
10 //! # pest. The Elegant Parser
12 //! pest is a [PEG](https://en.wikipedia.org/wiki/Parsing_expression_grammar) parser built with
13 //! *simplicity* and *speed* in mind.
15 //! This crate works in conjunction with the [`pest` crate](https://docs.rs/pest) by
16 //! deriving a grammar implementation based on a provided grammar.
20 //! Grammar definitions reside in custom `.pest` files located in the `src` directory. Their path is
21 //! relative to `src` and is specified between the `derive` attribute and empty `struct` that
22 //! `Parser` will be derived on.
24 //! Because of a limitation in procedural macros, there is no way for Cargo to know that a module
25 //! needs to be recompiled based on the file that the procedural macro is opening. This leads to the
26 //! case where modifying a `.pest` file without touching the file where the `derive` is does not
27 //! recompile it if it already has a working binary in the cache. To avoid this issue, the grammar
28 //! file can be included in a dummy `const` definition while debugging.
31 //! #[cfg(debug_assertions)]
32 //! const _GRAMMAR: &'static str = include_str!("path/to/my_grammar.pest"); // relative to this file
35 //! #[grammar = "path/to/my_grammar.pest"] // relative to src
41 //! A grammar is a series of rules separated by whitespace, possibly containing comments.
45 //! Comments start with `//` and end at the end of the line.
53 //! Rules have the following form:
56 //! name = optional_modifier { expression }
59 //! The name of the rule is formed from alphanumeric characters or `_` with the condition that the
60 //! first character is not a digit and is used to create token pairs. When the rule starts being
61 //! parsed, the starting part of the token is being produced, with the ending part being produced
62 //! when the rule finishes parsing.
64 //! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end
65 //! `b`, start `c`, end `c`, end `a`.
69 //! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the
70 //! behavior of the rules.
74 //! Silent rules do not create token pairs during parsing, nor are they error-reported.
81 //! Parsing `"ab"` produces the token pair `b()`.
85 //! Atomic rules do not accept whitespace or comments within their expressions and have a
86 //! cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic
87 //! rules behave atomically.
89 //! Any rules called by atomic rules do not generate token pairs.
95 //! whitespace = _{ " " }
98 //! Parsing `"ab"` produces the token pair `b()`, while `"a b"` produces an error.
100 //! 3. Compound-atomic (`$`)
102 //! Compound-atomic are identical to atomic rules with the exception that rules called by them are
103 //! not forbidden from generating token pairs.
109 //! whitespace = _{ " " }
112 //! Parsing `"ab"` produces the token pairs `b(a())`, while `"a b"` produces an error.
114 //! 4. Non-atomic (`!`)
116 //! Non-atomic are identical to normal rules with the exception that they stop the cascading effect
117 //! of atomic and compound-atomic rules.
124 //! whitespace = _{ " " }
127 //! Parsing both `"ab"` and `"a b"` produce the token pairs `c(a())`.
131 //! Expressions can be either terminals or non-terminals.
135 //! | Terminal | Usage |
136 //! |------------|----------------------------------------------------------------|
137 //! | `"a"` | matches the exact string `"a"` |
138 //! | `^"a"` | matches the exact string `"a"` case insensitively (ASCII only) |
139 //! | `'a'..'z'` | matches one character between `'a'` and `'z'` |
140 //! | `a` | matches rule `a` |
142 //! Strings and characters follow
143 //! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while
144 //! identifiers can contain alpha-numeric characters and underscores (`_`), as long as they do not
145 //! start with a digit.
149 //! | Non-terminal | Usage |
150 //! |--------------|------------------------------------------------------------|
151 //! | `(e)` | matches `e` |
152 //! | `e1 ~ e2` | matches the sequence `e1` `e2` |
153 //! | `e1 | e2` | matches either `e1` or `e2` |
154 //! | `e*` | matches `e` zero or more times |
155 //! | `e+` | matches `e` one or more times |
156 //! | `e{n}` | matches `e` exactly `n` times |
157 //! | `e{, n}` | matches `e` at most `n` times |
158 //! | `e{n,} ` | matches `e` at least `n` times |
159 //! | `e{m, n}` | matches `e` between `m` and `n` times inclusively |
160 //! | `e?` | optionally matches `e` |
161 //! | `&e` | matches `e` without making progress |
162 //! | `!e` | matches if `e` doesn't match without making progress |
//! | `push(e)`    | matches `e` and pushes its captured string down the stack   |
165 //! where `e`, `e1`, and `e2` are expressions.
169 //! Special rules can be called within the grammar. They are:
171 //! * `whitespace` - gets run between rules and sub-rules
172 //! * `comment` - gets run between rules and sub-rules
173 //! * `any` - matches exactly one `char`
174 //! * `soi` - (start-of-input) matches only when a `Parser` is still at the starting position
175 //! * `eoi` - (end-of-input) matches only when a `Parser` has reached its end
176 //! * `pop` - pops a string from the stack and matches it
177 //! * `peek` - peeks a string from the stack and matches it
//! `whitespace` and `comment` should be defined manually if needed. All other rules cannot be
//! overridden.
182 //! ## `whitespace` and `comment`
184 //! When defined, these rules get matched automatically in sequences (`~`) and repetitions
185 //! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt
186 //! from this behavior.
188 //! These rules should be defined so as to match one whitespace character and one comment only since
189 //! they are run in repetitions.
191 //! If both `whitespace` and `comment` are defined, this grammar:
197 //! is effectively transformed into this one behind the scenes:
200 //! a = { b ~ whitespace* ~ (comment ~ whitespace*)* ~ c }
203 //! ## `push`, `pop`, and `peek`
205 //! `push(e)` simply pushes the captured string of the expression `e` down a stack. This stack can
206 //! then later be used to match grammar based on its content with `pop` and `peek`.
208 //! `peek` always matches the string at the top of stack. So, if the stack contains `["a", "b"]`,
//! this grammar:
//! is effectively transformed into this one at parse time:
//! `pop` works the same way with the exception that it pops the string off of the stack if the
//! match worked. With the stack from above, if `pop` matches `"a"`, the stack will be mutated
//! into `["b"]`.
227 //! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This
228 //! implements `pest`'s `RuleType` and can be used throughout the API.
230 #![doc(html_root_url = "https://docs.rs/pest_derive")]
231 #![recursion_limit = "256"]
// External crates.
extern crate pest;
extern crate proc_macro;
extern crate syn;

// Crate-local modules: grammar AST, code generation, optimization, parsing,
// and validation. NOTE(review): restored from the crate layout the code below
// references (`parser::`, `optimizer::`, `generator::`) — confirm against repo.
mod ast;
mod generator;
mod optimizer;
mod parser;
mod validator;

use std::env;
use std::fs::File;
use std::io::{self, Read};
use std::path::Path;
use std::rc::Rc;

use pest::inputs::StringInput;
use pest::Parser;
use proc_macro::TokenStream;
use syn::{Attribute, Ident, Lit, MetaItem};

use parser::{GrammarParser, GrammarRule};
262 #[proc_macro_derive(Parser, attributes(grammar))]
263 pub fn derive_parser(input
: TokenStream
) -> TokenStream
{
264 let source
= input
.to_string();
266 let (name
, path
) = parse_derive(source
);
268 let root
= env
::var("CARGO_MANIFEST_DIR").unwrap_or(".".into());
269 let path
= Path
::new(&root
).join("src/").join(&path
);
270 let file_name
= match path
.file_name() {
271 Some(file_name
) => file_name
,
272 None
=> panic
!("grammar attribute should point to a file")
275 let data
= match read_file(&path
) {
277 Err(error
) => panic
!("error opening {:?}: {}", file_name
, error
)
281 let pairs
= match GrammarParser
::parse(GrammarRule
::grammar_rules
, input
) {
283 Err(error
) => panic
!(
284 "error parsing {:?}\n\n{}",
286 error
.renamed_rules(|rule
| match *rule
{
287 GrammarRule
::grammar_rule
=> "rule".to_owned(),
288 GrammarRule
::eoi
=> "end-of-input".to_owned(),
289 GrammarRule
::assignment_operator
=> "`=`".to_owned(),
290 GrammarRule
::silent_modifier
=> "`_`".to_owned(),
291 GrammarRule
::atomic_modifier
=> "`@`".to_owned(),
292 GrammarRule
::compound_atomic_modifier
=> "`$`".to_owned(),
293 GrammarRule
::non_atomic_modifier
=> "`!`".to_owned(),
294 GrammarRule
::opening_brace
=> "`{`".to_owned(),
295 GrammarRule
::closing_brace
=> "`}`".to_owned(),
296 GrammarRule
::opening_paren
=> "`(`".to_owned(),
297 GrammarRule
::positive_predicate_operator
=> "`&`".to_owned(),
298 GrammarRule
::negative_predicate_operator
=> "`!`".to_owned(),
299 GrammarRule
::sequence_operator
=> "`&`".to_owned(),
300 GrammarRule
::choice_operator
=> "`|`".to_owned(),
301 GrammarRule
::optional_operator
=> "`?`".to_owned(),
302 GrammarRule
::repeat_operator
=> "`*`".to_owned(),
303 GrammarRule
::repeat_once_operator
=> "`+`".to_owned(),
304 GrammarRule
::comma
=> "`,`".to_owned(),
305 GrammarRule
::closing_paren
=> "`)`".to_owned(),
306 GrammarRule
::quote
=> "`\"`".to_owned(),
307 GrammarRule
::insensitive_string
=> "`^`".to_owned(),
308 GrammarRule
::range_operator
=> "`..`".to_owned(),
309 GrammarRule
::single_quote
=> "`'`".to_owned(),
310 other_rule
=> format
!("{:?}", other_rule
)
315 let (ast
, defaults
) = parser
::consume_rules(pairs
);
316 let optimized
= optimizer
::optimize(ast
);
317 let generated
= generator
::generate(name
, optimized
, defaults
);
319 generated
.as_ref().parse().unwrap()
/// Reads the file at `path` into a `String`.
///
/// # Errors
///
/// Returns any `io::Error` raised while opening or reading the file.
fn read_file<P: AsRef<Path>>(path: P) -> io::Result<String> {
    let mut file = File::open(path.as_ref())?;
    let mut string = String::new();
    file.read_to_string(&mut string)?;
    // FIX: the success value was missing — return the accumulated contents.
    Ok(string)
}
329 fn parse_derive(source
: String
) -> (Ident
, String
) {
330 let ast
= syn
::parse_derive_input(&source
).unwrap();
331 let name
= Ident
::new(ast
.ident
.as_ref());
333 let grammar
: Vec
<_
> = ast
.attrs
335 .filter(|attr
| match attr
.value
{
336 MetaItem
::NameValue(ref ident
, _
) => format
!("{}", ident
) == "grammar",
341 let filename
= match grammar
.len() {
342 0 => panic
!("a grammar file needs to be provided with the #[grammar(\"...\")] attribute"),
343 1 => get_filename(grammar
[0]),
344 _
=> panic
!("only 1 grammar file can be provided")
350 fn get_filename(attr
: &Attribute
) -> String
{
351 if let MetaItem
::NameValue(_
, ref lit
) = attr
.value
{
352 if let &Lit
::Str(ref string
, _
) = lit
{
355 panic
!("grammar attribute must be a string")
#[cfg(test)]
mod tests {
    use super::parse_derive;

    /// A well-formed derive with one grammar attribute parses successfully.
    #[test]
    fn derive_ok() {
        let definition = "
            #[other_attr]
            #[grammar = \"myfile.pest\"]
            pub struct MyParser<'a, T>;
        ";
        let (_, filename) = parse_derive(definition.to_owned());
        assert_eq!(filename, "myfile.pest");
    }

    /// Two grammar attributes on one struct must be rejected.
    #[test]
    #[should_panic(expected = "only 1 grammar file can be provided")]
    fn derive_multiple_grammars() {
        let definition = "
            #[other_attr]
            #[grammar = \"myfile1.pest\"]
            #[grammar = \"myfile2.pest\"]
            pub struct MyParser<'a, T>;
        ";
        parse_derive(definition.to_owned());
    }

    /// A grammar attribute whose value is not a string literal must panic.
    #[test]
    #[should_panic(expected = "grammar attribute must be a string")]
    fn derive_wrong_arg() {
        let definition = "
            #[other_attr]
            #[grammar(\"myfile.pest\")]
            pub struct MyParser<'a, T>;
        ";
        parse_derive(definition.to_owned());
    }
}