]> git.proxmox.com Git - rustc.git/blob - src/vendor/pest_derive/src/lib.rs
New upstream version 1.31.0+dfsg1
[rustc.git] / src / vendor / pest_derive / src / lib.rs
1 // pest. The Elegant Parser
2 // Copyright (c) 2018 Dragoș Tiselice
3 //
4 // Licensed under the Apache License, Version 2.0
5 // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
6 // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. All files in the project carrying such notice may not be copied,
8 // modified, or distributed except according to those terms.
9
10 //! # pest. The Elegant Parser
11 //!
12 //! pest is a [PEG](https://en.wikipedia.org/wiki/Parsing_expression_grammar) parser built with
13 //! *simplicity* and *speed* in mind.
14 //!
15 //! This crate works in conjunction with the [`pest` crate](https://docs.rs/pest) by
16 //! deriving a grammar implementation based on a provided grammar.
17 //!
18 //! ## `.pest` files
19 //!
20 //! Grammar definitions reside in custom `.pest` files located in the `src` directory. Their path is
21 //! relative to `src` and is specified between the `derive` attribute and empty `struct` that
22 //! `Parser` will be derived on.
23 //!
24 //! Because of a limitation in procedural macros, there is no way for Cargo to know that a module
25 //! needs to be recompiled based on the file that the procedural macro is opening. This leads to the
26 //! case where modifying a `.pest` file without touching the file where the `derive` is does not
27 //! recompile it if it already has a working binary in the cache. To avoid this issue, the grammar
28 //! file can be included in a dummy `const` definition while debugging.
29 //!
30 //! ```ignore
31 //! #[cfg(debug_assertions)]
32 //! const _GRAMMAR: &'static str = include_str!("path/to/my_grammar.pest"); // relative to this file
33 //!
34 //! #[derive(Parser)]
35 //! #[grammar = "path/to/my_grammar.pest"] // relative to src
36 //! struct MyParser;
37 //! ```
38 //!
39 //! ## Grammar
40 //!
41 //! A grammar is a series of rules separated by whitespace, possibly containing comments.
42 //!
43 //! ### Comments
44 //!
45 //! Comments start with `//` and end at the end of the line.
46 //!
47 //! ```ignore
48 //! // a comment
49 //! ```
50 //!
51 //! ### Rules
52 //!
53 //! Rules have the following form:
54 //!
55 //! ```ignore
56 //! name = optional_modifier { expression }
57 //! ```
58 //!
59 //! The name of the rule is formed from alphanumeric characters or `_` with the condition that the
60 //! first character is not a digit and is used to create token pairs. When the rule starts being
61 //! parsed, the starting part of the token is being produced, with the ending part being produced
62 //! when the rule finishes parsing.
63 //!
64 //! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end
65 //! `b`, start `c`, end `c`, end `a`.
66 //!
67 //! #### Modifiers
68 //!
69 //! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the
70 //! behavior of the rules.
71 //!
72 //! 1. Silent (`_`)
73 //!
74 //! Silent rules do not create token pairs during parsing, nor are they error-reported.
75 //!
76 //! ```ignore
77 //! a = _{ "a" }
78 //! b = { a ~ "b" }
79 //! ```
80 //!
81 //! Parsing `"ab"` produces the token pair `b()`.
82 //!
83 //! 2. Atomic (`@`)
84 //!
85 //! Atomic rules do not accept whitespace or comments within their expressions and have a
86 //! cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic
87 //! rules behave atomically.
88 //!
89 //! Any rules called by atomic rules do not generate token pairs.
90 //!
91 //! ```ignore
92 //! a = { "a" }
93 //! b = @{ a ~ "b" }
94 //!
95 //! whitespace = _{ " " }
96 //! ```
97 //!
98 //! Parsing `"ab"` produces the token pair `b()`, while `"a b"` produces an error.
99 //!
100 //! 3. Compound-atomic (`$`)
101 //!
102 //! Compound-atomic are identical to atomic rules with the exception that rules called by them are
103 //! not forbidden from generating token pairs.
104 //!
105 //! ```ignore
106 //! a = { "a" }
107 //! b = ${ a ~ "b" }
108 //!
109 //! whitespace = _{ " " }
110 //! ```
111 //!
112 //! Parsing `"ab"` produces the token pairs `b(a())`, while `"a b"` produces an error.
113 //!
114 //! 4. Non-atomic (`!`)
115 //!
116 //! Non-atomic are identical to normal rules with the exception that they stop the cascading effect
117 //! of atomic and compound-atomic rules.
118 //!
119 //! ```ignore
120 //! a = { "a" }
121 //! b = !{ a ~ "b" }
122 //! c = @{ b }
123 //!
124 //! whitespace = _{ " " }
125 //! ```
126 //!
127 //! Parsing both `"ab"` and `"a b"` produce the token pairs `c(a())`.
128 //!
129 //! #### Expressions
130 //!
131 //! Expressions can be either terminals or non-terminals.
132 //!
133 //! 1. Terminals
134 //!
135 //! | Terminal | Usage |
136 //! |------------|----------------------------------------------------------------|
137 //! | `"a"` | matches the exact string `"a"` |
138 //! | `^"a"` | matches the exact string `"a"` case insensitively (ASCII only) |
139 //! | `'a'..'z'` | matches one character between `'a'` and `'z'` |
140 //! | `a` | matches rule `a` |
141 //!
142 //! Strings and characters follow
143 //! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while
144 //! identifiers can contain alpha-numeric characters and underscores (`_`), as long as they do not
145 //! start with a digit.
146 //!
147 //! 2. Non-terminals
148 //!
149 //! | Non-terminal | Usage |
150 //! |--------------|------------------------------------------------------------|
151 //! | `(e)` | matches `e` |
152 //! | `e1 ~ e2` | matches the sequence `e1` `e2` |
153 //! | `e1 | e2` | matches either `e1` or `e2` |
154 //! | `e*` | matches `e` zero or more times |
155 //! | `e+` | matches `e` one or more times |
156 //! | `e{n}` | matches `e` exactly `n` times |
157 //! | `e{, n}` | matches `e` at most `n` times |
158 //! | `e{n,} ` | matches `e` at least `n` times |
159 //! | `e{m, n}` | matches `e` between `m` and `n` times inclusively |
160 //! | `e?` | optionally matches `e` |
161 //! | `&e` | matches `e` without making progress |
162 //! | `!e` | matches if `e` doesn't match without making progress |
163 //! | `push(e)`    | matches `e` and pushes its captured string down the stack  |
164 //!
165 //! where `e`, `e1`, and `e2` are expressions.
166 //!
167 //! ## Special rules
168 //!
169 //! Special rules can be called within the grammar. They are:
170 //!
171 //! * `whitespace` - gets run between rules and sub-rules
172 //! * `comment` - gets run between rules and sub-rules
173 //! * `any` - matches exactly one `char`
174 //! * `soi` - (start-of-input) matches only when a `Parser` is still at the starting position
175 //! * `eoi` - (end-of-input) matches only when a `Parser` has reached its end
176 //! * `pop` - pops a string from the stack and matches it
177 //! * `peek` - peeks a string from the stack and matches it
178 //!
179 //! `whitespace` and `comment` should be defined manually if needed. All other rules cannot be
180 //! overridden.
181 //!
182 //! ## `whitespace` and `comment`
183 //!
184 //! When defined, these rules get matched automatically in sequences (`~`) and repetitions
185 //! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt
186 //! from this behavior.
187 //!
188 //! These rules should be defined so as to match one whitespace character and one comment only since
189 //! they are run in repetitions.
190 //!
191 //! If both `whitespace` and `comment` are defined, this grammar:
192 //!
193 //! ```ignore
194 //! a = { b ~ c }
195 //! ```
196 //!
197 //! is effectively transformed into this one behind the scenes:
198 //!
199 //! ```ignore
200 //! a = { b ~ whitespace* ~ (comment ~ whitespace*)* ~ c }
201 //! ```
202 //!
203 //! ## `push`, `pop`, and `peek`
204 //!
205 //! `push(e)` simply pushes the captured string of the expression `e` down a stack. This stack can
206 //! then later be used to match grammar based on its content with `pop` and `peek`.
207 //!
208 //! `peek` always matches the string at the top of stack. So, if the stack contains `["a", "b"]`,
209 //! then this grammar:
210 //!
211 //! ```ignore
212 //! a = { peek }
213 //! ```
214 //!
215 //! is effectively transformed into at parse time:
216 //!
217 //! ```ignore
218 //! a = { "a" }
219 //! ```
220 //!
221 //! `pop` works the same way with the exception that it pops the string off of the stack if the
222 //! match worked. With the stack from above, if `pop` matches `"a"`, the stack will be mutated
223 //! to `["b"]`.
224 //!
225 //! ## `Rule`
226 //!
227 //! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This
228 //! implements `pest`'s `RuleType` and can be used throughout the API.
229
230 #![doc(html_root_url = "https://docs.rs/pest_derive")]
231 #![recursion_limit = "256"]
232
233 #[cfg(test)]
234 #[macro_use]
235 extern crate pest;
236 #[cfg(not(test))]
237 extern crate pest;
238
239 extern crate proc_macro;
240 #[macro_use]
241 extern crate quote;
242 extern crate syn;
243
244 use std::env;
245 use std::fs::File;
246 use std::io::{self, Read};
247 use std::path::Path;
248
249 use pest::Parser;
250 use proc_macro::TokenStream;
251 use quote::Ident;
252 use syn::{Attribute, Lit, MetaItem};
253
254 mod ast;
255 mod generator;
256 mod optimizer;
257 mod parser;
258 mod validator;
259
260 use parser::{GrammarParser, GrammarRule};
261
262 #[proc_macro_derive(Parser, attributes(grammar))]
263 pub fn derive_parser(input: TokenStream) -> TokenStream {
264 let source = input.to_string();
265
266 let (name, path) = parse_derive(source);
267
268 let root = env::var("CARGO_MANIFEST_DIR").unwrap_or(".".into());
269 let path = Path::new(&root).join("src/").join(&path);
270 let file_name = match path.file_name() {
271 Some(file_name) => file_name,
272 None => panic!("grammar attribute should point to a file")
273 };
274
275 let data = match read_file(&path) {
276 Ok(data) => data,
277 Err(error) => panic!("error opening {:?}: {}", file_name, error)
278 };
279
280 let input = &data;
281 let pairs = match GrammarParser::parse(GrammarRule::grammar_rules, input) {
282 Ok(pairs) => pairs,
283 Err(error) => panic!(
284 "error parsing {:?}\n\n{}",
285 file_name,
286 error.renamed_rules(|rule| match *rule {
287 GrammarRule::grammar_rule => "rule".to_owned(),
288 GrammarRule::eoi => "end-of-input".to_owned(),
289 GrammarRule::assignment_operator => "`=`".to_owned(),
290 GrammarRule::silent_modifier => "`_`".to_owned(),
291 GrammarRule::atomic_modifier => "`@`".to_owned(),
292 GrammarRule::compound_atomic_modifier => "`$`".to_owned(),
293 GrammarRule::non_atomic_modifier => "`!`".to_owned(),
294 GrammarRule::opening_brace => "`{`".to_owned(),
295 GrammarRule::closing_brace => "`}`".to_owned(),
296 GrammarRule::opening_paren => "`(`".to_owned(),
297 GrammarRule::positive_predicate_operator => "`&`".to_owned(),
298 GrammarRule::negative_predicate_operator => "`!`".to_owned(),
299 GrammarRule::sequence_operator => "`&`".to_owned(),
300 GrammarRule::choice_operator => "`|`".to_owned(),
301 GrammarRule::optional_operator => "`?`".to_owned(),
302 GrammarRule::repeat_operator => "`*`".to_owned(),
303 GrammarRule::repeat_once_operator => "`+`".to_owned(),
304 GrammarRule::comma => "`,`".to_owned(),
305 GrammarRule::closing_paren => "`)`".to_owned(),
306 GrammarRule::quote => "`\"`".to_owned(),
307 GrammarRule::insensitive_string => "`^`".to_owned(),
308 GrammarRule::range_operator => "`..`".to_owned(),
309 GrammarRule::single_quote => "`'`".to_owned(),
310 other_rule => format!("{:?}", other_rule)
311 })
312 )
313 };
314
315 let (ast, defaults) = parser::consume_rules(pairs);
316 let optimized = optimizer::optimize(ast);
317 let generated = generator::generate(name, optimized, defaults);
318
319 generated.as_ref().parse().unwrap()
320 }
321
/// Reads the entire contents of the file at `path` into a `String`.
///
/// Returns any I/O error from opening or reading the file.
fn read_file<P: AsRef<Path>>(path: P) -> io::Result<String> {
    let mut contents = String::new();
    File::open(path.as_ref())?.read_to_string(&mut contents)?;
    Ok(contents)
}
328
329 fn parse_derive(source: String) -> (Ident, String) {
330 let ast = syn::parse_derive_input(&source).unwrap();
331 let name = Ident::new(ast.ident.as_ref());
332
333 let grammar: Vec<_> = ast.attrs
334 .iter()
335 .filter(|attr| match attr.value {
336 MetaItem::NameValue(ref ident, _) => format!("{}", ident) == "grammar",
337 _ => false
338 })
339 .collect();
340
341 let filename = match grammar.len() {
342 0 => panic!("a grammar file needs to be provided with the #[grammar(\"...\")] attribute"),
343 1 => get_filename(grammar[0]),
344 _ => panic!("only 1 grammar file can be provided")
345 };
346
347 (name, filename)
348 }
349
350 fn get_filename(attr: &Attribute) -> String {
351 if let MetaItem::NameValue(_, ref lit) = attr.value {
352 if let &Lit::Str(ref string, _) = lit {
353 string.clone()
354 } else {
355 panic!("grammar attribute must be a string")
356 }
357 } else {
358 unreachable!();
359 }
360 }
361
#[cfg(test)]
mod tests {
    use super::parse_derive;

    // A single well-formed grammar attribute yields its file name.
    #[test]
    fn derive_ok() {
        let definition = r#"
            #[other_attr]
            #[grammar = "myfile.pest"]
            pub struct MyParser<'a, T>;
        "#;
        let (_, filename) = parse_derive(definition.to_owned());

        assert_eq!(filename, "myfile.pest");
    }

    // Two grammar attributes on one struct must be rejected.
    #[test]
    #[should_panic(expected = "only 1 grammar file can be provided")]
    fn derive_multiple_grammars() {
        let definition = r#"
            #[other_attr]
            #[grammar = "myfile1.pest"]
            #[grammar = "myfile2.pest"]
            pub struct MyParser<'a, T>;
        "#;
        parse_derive(definition.to_owned());
    }

    // A non-string attribute value must be rejected.
    #[test]
    #[should_panic(expected = "grammar attribute must be a string")]
    fn derive_wrong_arg() {
        let definition = r#"
            #[other_attr]
            #[grammar = 1]
            pub struct MyParser<'a, T>;
        "#;
        parse_derive(definition.to_owned());
    }
}