src/vendor/pest_derive/src/lib.rs

   1 // pest. The Elegant Parser
   2 // Copyright (c) 2018 Dragoș Tiselice
   3 //
   4 // Licensed under the Apache License, Version 2.0
   5 // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
   6 // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   7 // option. All files in the project carrying such notice may not be copied,
   8 // modified, or distributed except according to those terms.
   9
  10 //! # pest. The Elegant Parser
  11 //!
  12 //! pest is a [PEG](https://en.wikipedia.org/wiki/Parsing_expression_grammar) parser built with
  13 //! *simplicity* and *speed* in mind.
  14 //!
  15 //! This crate works in conjunction with the [`pest` crate](https://docs.rs/pest) by
  16 //! deriving a grammar implementation based on a provided grammar.
  17 //!
  18 //! ## `.pest` files
  19 //!
  20 //! Grammar definitions reside in custom `.pest` files located in the `src` directory. Their path is
  21 //! relative to `src` and is specified between the `derive` attribute and empty `struct` that
  22 //! `Parser` will be derived on.
  23 //!
  24 //! Because of a limitation in procedural macros, there is no way for Cargo to know that a module
  25 //! needs to be recompiled based on the file that the procedural macro is opening. This leads to the
  26 //! case where modifying a `.pest` file without touching the file where the `derive` is does not
  27 //! recompile it if it already has a working binary in the cache. To avoid this issue, the grammar
  28 //! file can be included in a dummy `const` definition while debugging.
  29 //!
  30 //! ```ignore
  31 //! #[cfg(debug_assertions)]
  32 //! const _GRAMMAR: &'static str = include_str!("path/to/my_grammar.pest"); // relative to this file
  33 //!
  34 //! #[derive(Parser)]
  35 //! #[grammar = "path/to/my_grammar.pest"] // relative to src
  36 //! struct MyParser;
  37 //! ```
  38 //!
  39 //! ## Grammar
  40 //!
  41 //! A grammar is a series of rules separated by whitespace, possibly containing comments.
  42 //!
  43 //! ### Comments
  44 //!
  45 //! Comments start with `//` and end at the end of the line.
  46 //!
  47 //! ```ignore
  48 //! // a comment
  49 //! ```
  50 //!
  51 //! ### Rules
  52 //!
  53 //! Rules have the following form:
  54 //!
  55 //! ```ignore
  56 //! name = optional_modifier { expression }
  57 //! ```
  58 //!
  59 //! The name of the rule is formed from alphanumeric characters or `_` with the condition that the
  60 //! first character is not a digit and is used to create token pairs. When the rule starts being
  61 //! parsed, the starting part of the token is being produced, with the ending part being produced
  62 //! when the rule finishes parsing.
  63 //!
  64 //! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end
  65 //! `b`, start `c`, end `c`, end `a`.
  66 //!
  67 //! #### Modifiers
  68 //!
  69 //! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the
  70 //! behavior of the rules.
  71 //!
  72 //! 1. Silent (`_`)
  73 //!
  74 //!     Silent rules do not create token pairs during parsing, nor are they error-reported.
  75 //!
  76 //!     ```ignore
  77 //!     a = _{ "a" }
  78 //!     b =  { a ~ "b" }
  79 //!     ```
  80 //!
  81 //!     Parsing `"ab"` produces the token pair `b()`.
  82 //!
  83 //! 2. Atomic (`@`)
  84 //!
  85 //!     Atomic rules do not accept whitespace or comments within their expressions and have a
  86 //!     cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic
  87 //!     rules behave atomically.
  88 //!
  89 //!     Any rules called by atomic rules do not generate token pairs.
  90 //!
  91 //!     ```ignore
  92 //!     a =  { "a" }
  93 //!     b = @{ a ~ "b" }
  94 //!
  95 //!     whitespace = _{ " " }
  96 //!     ```
  97 //!
  98 //!     Parsing `"ab"` produces the token pair `b()`, while `"a   b"` produces an error.
  99 //!
 100 //! 3. Compound-atomic (`$`)
 101 //!
 102 //!     Compound-atomic are identical to atomic rules with the exception that rules called by them are
 103 //!     not forbidden from generating token pairs.
 104 //!
 105 //!     ```ignore
 106 //!     a =  { "a" }
 107 //!     b = ${ a ~ "b" }
 108 //!
 109 //!     whitespace = _{ " " }
 110 //!     ```
 111 //!
 112 //!     Parsing `"ab"` produces the token pairs `b(a())`, while `"a   b"` produces an error.
 113 //!
 114 //! 4. Non-atomic (`!`)
 115 //!
 116 //!     Non-atomic are identical to normal rules with the exception that they stop the cascading effect
 117 //!     of atomic and compound-atomic rules.
 118 //!
 119 //!     ```ignore
 120 //!     a =  { "a" }
 121 //!     b = !{ a ~ "b" }
 122 //!     c = @{ b }
 123 //!
 124 //!     whitespace = _{ " " }
 125 //!     ```
 126 //!
 127 //!     Parsing both `"ab"` and `"a   b"` produce the token pairs `c(a())`.
 128 //!
 129 //! #### Expressions
 130 //!
 131 //! Expressions can be either terminals or non-terminals.
 132 //!
 133 //! 1. Terminals
 134 //!
 135 //!     | Terminal   | Usage                                                          |
 136 //!     |------------|----------------------------------------------------------------|
 137 //!     | `"a"`      | matches the exact string `"a"`                                 |
 138 //!     | `^"a"`     | matches the exact string `"a"` case insensitively (ASCII only) |
 139 //!     | `'a'..'z'` | matches one character between `'a'` and `'z'`                  |
 140 //!     | `a`        | matches rule `a`                                               |
 141 //!
 142 //! Strings and characters follow
 143 //! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while
 144 //! identifiers can contain alpha-numeric characters and underscores (`_`), as long as they do not
 145 //! start with a digit.
 146 //!
 147 //! 2. Non-terminals
 148 //!
 149 //!     | Non-terminal | Usage                                                      |
 150 //!     |--------------|------------------------------------------------------------|
 151 //!     | `(e)`        | matches `e`                                                |
 152 //!     | `e1 ~ e2`    | matches the sequence `e1` `e2`                             |
 153 //!     | `e1 | e2`    | matches either `e1` or `e2`                                |
 154 //!     | `e*`         | matches `e` zero or more times                             |
 155 //!     | `e+`         | matches `e` one or more times                              |
 156 //!     | `e{n}`       | matches `e` exactly `n` times                              |
 157 //!     | `e{, n}`     | matches `e` at most `n` times                              |
 158 //!     | `e{n,} `     | matches `e` at least `n` times                             |
 159 //!     | `e{m, n}`    | matches `e` between `m` and `n` times inclusively          |
 160 //!     | `e?`         | optionally matches `e`                                     |
 161 //!     | `&e`         | matches `e` without making progress                        |
 162 //!     | `!e`         | matches if `e` doesn't match without making progress       |
 163 //!     | `push(e)`    | matches `e` and pushes it's captured string down the stack |
 164 //!
 165 //!     where `e`, `e1`, and `e2` are expressions.
 166 //!
 167 //! ## Special rules
 168 //!
 169 //! Special rules can be called within the grammar. They are:
 170 //!
 171 //! * `whitespace` - gets run between rules and sub-rules
 172 //! * `comment` - gets run between rules and sub-rules
 173 //! * `any` - matches exactly one `char`
 174 //! * `soi` - (start-of-input) matches only when a `Parser` is still at the starting position
 175 //! * `eoi` - (end-of-input) matches only when a `Parser` has reached its end
 176 //! * `pop` - pops a string from the stack and matches it
 177 //! * `peek` - peeks a string from the stack and matches it
 178 //!
 179 //! `whitespace` and `comment` should be defined manually if needed. All other rules cannot be
 180 //! overridden.
 181 //!
 182 //! ## `whitespace` and `comment`
 183 //!
 184 //! When defined, these rules get matched automatically in sequences (`~`) and repetitions
 185 //! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt
 186 //! from this behavior.
 187 //!
 188 //! These rules should be defined so as to match one whitespace character and one comment only since
 189 //! they are run in repetitions.
 190 //!
 191 //! If both `whitespace` and `comment` are defined, this grammar:
 192 //!
 193 //! ```ignore
 194 //! a = { b ~ c }
 195 //! ```
 196 //!
 197 //! is effectively transformed into this one behind the scenes:
 198 //!
 199 //! ```ignore
 200 //! a = { b ~ whitespace* ~ (comment ~ whitespace*)* ~ c }
 201 //! ```
 202 //!
 203 //! ## `push`, `pop`, and `peek`
 204 //!
 205 //! `push(e)` simply pushes the captured string of the expression `e` down a stack. This stack can
 206 //! then later be used to match grammar based on its content with `pop` and `peek`.
 207 //!
 208 //! `peek` always matches the string at the top of stack. So, if the stack contains `["a", "b"]`,
 209 //! the this grammar:
 210 //!
 211 //! ```ignore
 212 //! a = { peek }
 213 //! ```
 214 //!
 215 //! is effectively transformed into at parse time:
 216 //!
 217 //! ```ignore
 218 //! a = { "a" }
 219 //! ```
 220 //!
 221 //! `pop` works the same way with the exception that it pops the string off of the stack if the
 222 //! the match worked. With the stack from above, if `pop` matches `"a"`, the stack will be mutated
 223 //! to `["b"]`.
 224 //!
 225 //! ## `Rule`
 226 //!
 227 //! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This
 228 //! implements `pest`'s `RuleType` and can be used throughout the API.
 229
 230 #![doc(html_root_url = "https://docs.rs/pest_derive")]
 231 #![recursion_limit = "256"]
 232
 233 #[cfg(test)]
 234 #[macro_use]
 235 extern crate pest;
 236 #[cfg(not(test))]
 237 extern crate pest;
 238
 239 extern crate proc_macro;
 240 #[macro_use]
 241 extern crate quote;
 242 extern crate syn;
 243
 244 use std::env;
 245 use std::fs::File;
 246 use std::io::{self, Read};
 247 use std::path::Path;
 248
 249 use pest::Parser;
 250 use proc_macro::TokenStream;
 251 use quote::Ident;
 252 use syn::{Attribute, Lit, MetaItem};
 253
 254 mod ast;
 255 mod generator;
 256 mod optimizer;
 257 mod parser;
 258 mod validator;
 259
 260 use parser::{GrammarParser, GrammarRule};
 261
 262 #[proc_macro_derive(Parser, attributes(grammar))]
 263 pub fn derive_parser(input: TokenStream) -> TokenStream {
 264     let source = input.to_string();
 265
 266     let (name, path) = parse_derive(source);
 267
 268     let root = env::var("CARGO_MANIFEST_DIR").unwrap_or(".".into());
 269     let path = Path::new(&root).join("src/").join(&path);
 270     let file_name = match path.file_name() {
 271         Some(file_name) => file_name,
 272         None => panic!("grammar attribute should point to a file")
 273     };
 274
 275     let data = match read_file(&path) {
 276         Ok(data) => data,
 277         Err(error) => panic!("error opening {:?}: {}", file_name, error)
 278     };
 279
 280     let input = &data;
 281     let pairs = match GrammarParser::parse(GrammarRule::grammar_rules, input) {
 282         Ok(pairs) => pairs,
 283         Err(error) => panic!(
 284             "error parsing {:?}\n\n{}",
 285             file_name,
 286             error.renamed_rules(|rule| match *rule {
 287                 GrammarRule::grammar_rule => "rule".to_owned(),
 288                 GrammarRule::eoi => "end-of-input".to_owned(),
 289                 GrammarRule::assignment_operator => "`=`".to_owned(),
 290                 GrammarRule::silent_modifier => "`_`".to_owned(),
 291                 GrammarRule::atomic_modifier => "`@`".to_owned(),
 292                 GrammarRule::compound_atomic_modifier => "`$`".to_owned(),
 293                 GrammarRule::non_atomic_modifier => "`!`".to_owned(),
 294                 GrammarRule::opening_brace => "`{`".to_owned(),
 295                 GrammarRule::closing_brace => "`}`".to_owned(),
 296                 GrammarRule::opening_paren => "`(`".to_owned(),
 297                 GrammarRule::positive_predicate_operator => "`&`".to_owned(),
 298                 GrammarRule::negative_predicate_operator => "`!`".to_owned(),
 299                 GrammarRule::sequence_operator => "`&`".to_owned(),
 300                 GrammarRule::choice_operator => "`|`".to_owned(),
 301                 GrammarRule::optional_operator => "`?`".to_owned(),
 302                 GrammarRule::repeat_operator => "`*`".to_owned(),
 303                 GrammarRule::repeat_once_operator => "`+`".to_owned(),
 304                 GrammarRule::comma => "`,`".to_owned(),
 305                 GrammarRule::closing_paren => "`)`".to_owned(),
 306                 GrammarRule::quote => "`\"`".to_owned(),
 307                 GrammarRule::insensitive_string => "`^`".to_owned(),
 308                 GrammarRule::range_operator => "`..`".to_owned(),
 309                 GrammarRule::single_quote => "`'`".to_owned(),
 310                 other_rule => format!("{:?}", other_rule)
 311             })
 312         )
 313     };
 314
 315     let (ast, defaults) = parser::consume_rules(pairs);
 316     let optimized = optimizer::optimize(ast);
 317     let generated = generator::generate(name, optimized, defaults);
 318
 319     generated.as_ref().parse().unwrap()
 320 }
 321
 322 fn read_file<P: AsRef<Path>>(path: P) -> io::Result<String> {
 323     let mut file = File::open(path.as_ref())?;
 324     let mut string = String::new();
 325     file.read_to_string(&mut string)?;
 326     Ok(string)
 327 }
 328
 329 fn parse_derive(source: String) -> (Ident, String) {
 330     let ast = syn::parse_derive_input(&source).unwrap();
 331     let name = Ident::new(ast.ident.as_ref());
 332
 333     let grammar: Vec<_> = ast.attrs
 334         .iter()
 335         .filter(|attr| match attr.value {
 336             MetaItem::NameValue(ref ident, _) => format!("{}", ident) == "grammar",
 337             _ => false
 338         })
 339         .collect();
 340
 341     let filename = match grammar.len() {
 342         0 => panic!("a grammar file needs to be provided with the #[grammar(\"...\")] attribute"),
 343         1 => get_filename(grammar[0]),
 344         _ => panic!("only 1 grammar file can be provided")
 345     };
 346
 347     (name, filename)
 348 }
 349
 350 fn get_filename(attr: &Attribute) -> String {
 351     if let MetaItem::NameValue(_, ref lit) = attr.value {
 352         if let &Lit::Str(ref string, _) = lit {
 353             string.clone()
 354         } else {
 355             panic!("grammar attribute must be a string")
 356         }
 357     } else {
 358         unreachable!();
 359     }
 360 }
 361
 362 #[cfg(test)]
 363 mod tests {
 364     use super::parse_derive;
 365
 366     #[test]
 367     fn derive_ok() {
 368         let definition = "
 369             #[other_attr]
 370             #[grammar = \"myfile.pest\"]
 371             pub struct MyParser<'a, T>;
 372         ";
 373         let (_, filename) = parse_derive(definition.to_owned());
 374
 375         assert_eq!(filename, "myfile.pest");
 376     }
 377
 378     #[test]
 379     #[should_panic(expected = "only 1 grammar file can be provided")]
 380     fn derive_multiple_grammars() {
 381         let definition = "
 382             #[other_attr]
 383             #[grammar = \"myfile1.pest\"]
 384             #[grammar = \"myfile2.pest\"]
 385             pub struct MyParser<'a, T>;
 386         ";
 387         parse_derive(definition.to_owned());
 388     }
 389
 390     #[test]
 391     #[should_panic(expected = "grammar attribute must be a string")]
 392     fn derive_wrong_arg() {
 393         let definition = "
 394             #[other_attr]
 395             #[grammar = 1]
 396             pub struct MyParser<'a, T>;
 397         ";
 398         parse_derive(definition.to_owned());
 399     }
 400 }