src/libsyntax/ext/tt/macro_parser.rs

   1 // Copyright 2012 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 // Earley-like parser for macros.
  12 use ast;
  13 use ast::{matcher, match_tok, match_seq, match_nonterminal, ident};
  14 use codemap::{BytePos, mk_sp};
  15 use codemap;
  16 use parse::lexer::*; //resolve bug?
  17 use parse::ParseSess;
  18 use parse::parser::Parser;
  19 use parse::token::{Token, EOF, to_str, nonterminal};
  20 use parse::token;
  21
  22 use core::prelude::*;
  23 use core::hashmap::linear::LinearMap;
  24
  25 /* This is an Earley-like parser, without support for in-grammar nonterminals,
  26 only by calling out to the main rust parser for named nonterminals (which it
  27 commits to fully when it hits one in a grammar). This means that there are no
  28 completer or predictor rules, and therefore no need to store one column per
  29 token: instead, there's a set of current Earley items and a set of next
  30 ones. Instead of NTs, we have a special case for Kleene star. The big-O, in
  31 pathological cases, is worse than traditional Earley parsing, but it's an
  32 easier fit for Macro-by-Example-style rules, and I think the overhead is
  33 lower. (In order to prevent the pathological case, we'd need to lazily
  34 construct the resulting `named_match`es at the very end. It'd be a pain,
  35 and require more memory to keep around old items, but it would also save
  36 overhead)*/
  37
  38 /* Quick intro to how the parser works:
  39
  40 A 'position' is a dot in the middle of a matcher, usually represented as a
  41 dot. For example `· a $( a )* a b` is a position, as is `a $( · a )* a b`.
  42
  43 The parser walks through the input a character at a time, maintaining a list
  44 of items consistent with the current position in the input string: `cur_eis`.
  45
  46 As it processes them, it fills up `eof_eis` with items that would be valid if
  47 the macro invocation is now over, `bb_eis` with items that are waiting on
  48 a Rust nonterminal like `$e:expr`, and `next_eis` with items that are waiting
  49 on the a particular token. Most of the logic concerns moving the · through the
  50 repetitions indicated by Kleene stars. It only advances or calls out to the
  51 real Rust parser when no `cur_eis` items remain
  52
  53 Example: Start parsing `a a a a b` against [· a $( a )* a b].
  54
  55 Remaining input: `a a a a b`
  56 next_eis: [· a $( a )* a b]
  57
  58 - - - Advance over an `a`. - - -
  59
  60 Remaining input: `a a a b`
  61 cur: [a · $( a )* a b]
  62 Descend/Skip (first item).
  63 next: [a $( · a )* a b]  [a $( a )* · a b].
  64
  65 - - - Advance over an `a`. - - -
  66
  67 Remaining input: `a a b`
  68 cur: [a $( a · )* a b]  next: [a $( a )* a · b]
  69 Finish/Repeat (first item)
  70 next: [a $( a )* · a b]  [a $( · a )* a b]  [a $( a )* a · b]
  71
  72 - - - Advance over an `a`. - - - (this looks exactly like the last step)
  73
  74 Remaining input: `a b`
  75 cur: [a $( a · )* a b]  next: [a $( a )* a · b]
  76 Finish/Repeat (first item)
  77 next: [a $( a )* · a b]  [a $( · a )* a b]  [a $( a )* a · b]
  78
  79 - - - Advance over an `a`. - - - (this looks exactly like the last step)
  80
  81 Remaining input: `b`
  82 cur: [a $( a · )* a b]  next: [a $( a )* a · b]
  83 Finish/Repeat (first item)
  84 next: [a $( a )* · a b]  [a $( · a )* a b]
  85
  86 - - - Advance over a `b`. - - -
  87
  88 Remaining input: ``
  89 eof: [a $( a )* a b ·]
  90
  91  */
  92
  93
  94 /* to avoid costly uniqueness checks, we require that `match_seq` always has a
  95 nonempty body. */
  96
  97 pub enum matcher_pos_up { /* to break a circularity */
  98     matcher_pos_up(Option<~MatcherPos>)
  99 }
 100
 101 pub fn is_some(&&mpu: matcher_pos_up) -> bool {
 102     match &mpu {
 103       &matcher_pos_up(None) => false,
 104       _ => true
 105     }
 106 }
 107
 108 pub struct MatcherPos {
 109     elts: ~[ast::matcher], // maybe should be <'>? Need to understand regions.
 110     sep: Option<Token>,
 111     idx: uint,
 112     up: matcher_pos_up, // mutable for swapping only
 113     matches: ~[~[@named_match]],
 114     match_lo: uint, match_hi: uint,
 115     sp_lo: BytePos,
 116 }
 117
 118 pub fn copy_up(&& mpu: matcher_pos_up) -> ~MatcherPos {
 119     match &mpu {
 120       &matcher_pos_up(Some(ref mp)) => copy (*mp),
 121       _ => fail!()
 122     }
 123 }
 124
 125 pub fn count_names(ms: &[matcher]) -> uint {
 126     vec::foldl(0u, ms, |ct, m| {
 127         ct + match m.node {
 128           match_tok(_) => 0u,
 129           match_seq(ref more_ms, _, _, _, _) => count_names((*more_ms)),
 130           match_nonterminal(_,_,_) => 1u
 131         }})
 132 }
 133
 134 #[allow(non_implicitly_copyable_typarams)]
 135 pub fn initial_matcher_pos(+ms: ~[matcher], +sep: Option<Token>, lo: BytePos)
 136                         -> ~MatcherPos {
 137     let mut match_idx_hi = 0u;
 138     for ms.each |elt| {
 139         match elt.node {
 140           match_tok(_) => (),
 141           match_seq(_,_,_,_,hi) => {
 142             match_idx_hi = hi;       // it is monotonic...
 143           }
 144           match_nonterminal(_,_,pos) => {
 145             match_idx_hi = pos+1u;  // ...so latest is highest
 146           }
 147         }
 148     }
 149     let matches = vec::from_fn(count_names(ms), |_i| ~[]);
 150     ~MatcherPos {
 151         elts: ms,
 152         sep: sep,
 153         idx: 0u,
 154         up: matcher_pos_up(None),
 155         matches: matches,
 156         match_lo: 0u,
 157         match_hi: match_idx_hi,
 158         sp_lo: lo
 159     }
 160 }
 161
 162 // named_match is a pattern-match result for a single ast::match_nonterminal:
 163 // so it is associated with a single ident in a parse, and all
 164 // matched_nonterminals in the named_match have the same nonterminal type
 165 // (expr, item, etc). All the leaves in a single named_match correspond to a
 166 // single matcher_nonterminal in the ast::matcher that produced it.
 167 //
 168 // It should probably be renamed, it has more or less exact correspondence to
 169 // ast::match nodes, and the in-memory structure of a particular named_match
 170 // represents the match that occurred when a particular subset of an
 171 // ast::match -- those ast::matcher nodes leading to a single
 172 // match_nonterminal -- was applied to a particular token tree.
 173 //
 174 // The width of each matched_seq in the named_match, and the identity of the
 175 // matched_nonterminals, will depend on the token tree it was applied to: each
 176 // matched_seq corresponds to a single match_seq in the originating
 177 // ast::matcher. The depth of the named_match structure will therefore depend
 178 // only on the nesting depth of ast::match_seqs in the originating
 179 // ast::matcher it was derived from.
 180
 181 pub enum named_match {
 182     matched_seq(~[@named_match], codemap::span),
 183     matched_nonterminal(nonterminal)
 184 }
 185
 186 pub type earley_item = ~MatcherPos;
 187
 188 pub fn nameize(p_s: @mut ParseSess, ms: ~[matcher], res: ~[@named_match])
 189             -> LinearMap<ident,@named_match> {
 190     fn n_rec(p_s: @mut ParseSess, m: matcher, res: ~[@named_match],
 191              ret_val: &mut LinearMap<ident, @named_match>) {
 192         match m {
 193           codemap::spanned {node: match_tok(_), _} => (),
 194           codemap::spanned {node: match_seq(ref more_ms, _, _, _, _), _} => {
 195             for (*more_ms).each() |next_m| {
 196                 n_rec(p_s, *next_m, res, ret_val)
 197             };
 198           }
 199           codemap::spanned {
 200                 node: match_nonterminal(bind_name, _, idx), span: sp
 201           } => {
 202             if ret_val.contains_key(&bind_name) {
 203                 p_s.span_diagnostic.span_fatal(sp, ~"Duplicated bind name: "+
 204                                                *p_s.interner.get(bind_name))
 205             }
 206             ret_val.insert(bind_name, res[idx]);
 207           }
 208         }
 209     }
 210     let mut ret_val = LinearMap::new();
 211     for ms.each() |m| { n_rec(p_s, *m, res, &mut ret_val) }
 212     return ret_val;
 213 }
 214
 215 pub enum parse_result {
 216     success(LinearMap<ident, @named_match>),
 217     failure(codemap::span, ~str),
 218     error(codemap::span, ~str)
 219 }
 220
 221 pub fn parse_or_else(
 222     sess: @mut ParseSess,
 223     +cfg: ast::crate_cfg,
 224     rdr: @reader,
 225     ms: ~[matcher]
 226 ) -> LinearMap<ident, @named_match> {
 227     match parse(sess, cfg, rdr, ms) {
 228       success(m) => m,
 229       failure(sp, str) => sess.span_diagnostic.span_fatal(sp, str),
 230       error(sp, str) => sess.span_diagnostic.span_fatal(sp, str)
 231     }
 232 }
 233
 234 pub fn parse(
 235     sess: @mut ParseSess,
 236     cfg: ast::crate_cfg,
 237     rdr: @reader,
 238     ms: ~[matcher]
 239 ) -> parse_result {
 240     let mut cur_eis = ~[];
 241     cur_eis.push(initial_matcher_pos(copy ms, None, rdr.peek().sp.lo));
 242
 243     loop {
 244         let mut bb_eis = ~[]; // black-box parsed by parser.rs
 245         let mut next_eis = ~[]; // or proceed normally
 246         let mut eof_eis = ~[];
 247
 248         let TokenAndSpan {tok: tok, sp: sp} = rdr.peek();
 249
 250         /* we append new items to this while we go */
 251         while cur_eis.len() > 0u { /* for each Earley Item */
 252             let mut ei = cur_eis.pop();
 253
 254             let idx = ei.idx;
 255             let len = ei.elts.len();
 256
 257             /* at end of sequence */
 258             if idx >= len {
 259                 // can't move out of `match`es, so:
 260                 if is_some(ei.up) {
 261                     // hack: a matcher sequence is repeating iff it has a
 262                     // parent (the top level is just a container)
 263
 264
 265                     // disregard separator, try to go up
 266                     // (remove this condition to make trailing seps ok)
 267                     if idx == len {
 268                         // pop from the matcher position
 269
 270                         let mut new_pos = copy_up(ei.up);
 271
 272                         // update matches (the MBE "parse tree") by appending
 273                         // each tree as a subtree.
 274
 275                         // I bet this is a perf problem: we're preemptively
 276                         // doing a lot of array work that will get thrown away
 277                         // most of the time.
 278
 279                         // Only touch the binders we have actually bound
 280                         for uint::range(ei.match_lo, ei.match_hi) |idx| {
 281                             let sub = ei.matches[idx];
 282                             new_pos.matches[idx]
 283                                 .push(@matched_seq(sub,
 284                                                    mk_sp(ei.sp_lo,
 285                                                          sp.hi)));
 286                         }
 287
 288                         new_pos.idx += 1;
 289                         cur_eis.push(new_pos);
 290                     }
 291
 292                     // can we go around again?
 293
 294                     // the *_t vars are workarounds for the lack of unary move
 295                     match copy ei.sep {
 296                       Some(ref t) if idx == len => { // we need a separator
 297                         if tok == (*t) { //pass the separator
 298                             let mut ei_t = ei;
 299                             ei_t.idx += 1;
 300                             next_eis.push(ei_t);
 301                         }
 302                       }
 303                       _ => { // we don't need a separator
 304                         let mut ei_t = ei;
 305                         ei_t.idx = 0;
 306                         cur_eis.push(ei_t);
 307                       }
 308                     }
 309                 } else {
 310                     eof_eis.push(ei);
 311                 }
 312             } else {
 313                 match copy ei.elts[idx].node {
 314                   /* need to descend into sequence */
 315                   match_seq(ref matchers, ref sep, zero_ok,
 316                             match_idx_lo, match_idx_hi) => {
 317                     if zero_ok {
 318                         let mut new_ei = copy ei;
 319                         new_ei.idx += 1u;
 320                         //we specifically matched zero repeats.
 321                         for uint::range(match_idx_lo, match_idx_hi) |idx| {
 322                             new_ei.matches[idx].push(@matched_seq(~[], sp));
 323                         }
 324
 325                         cur_eis.push(new_ei);
 326                     }
 327
 328                     let matches = vec::map(ei.matches, // fresh, same size:
 329                                            |_m| ~[]);
 330                     let ei_t = ei;
 331                     cur_eis.push(~MatcherPos {
 332                         elts: copy *matchers,
 333                         sep: copy *sep,
 334                         idx: 0u,
 335                         up: matcher_pos_up(Some(ei_t)),
 336                         matches: matches,
 337                         match_lo: match_idx_lo, match_hi: match_idx_hi,
 338                         sp_lo: sp.lo
 339                     });
 340                   }
 341                   match_nonterminal(_,_,_) => { bb_eis.push(ei) }
 342                   match_tok(ref t) => {
 343                     let mut ei_t = ei;
 344                     if (*t) == tok {
 345                         ei_t.idx += 1;
 346                         next_eis.push(ei_t);
 347                     }
 348                   }
 349                 }
 350             }
 351         }
 352
 353         /* error messages here could be improved with links to orig. rules */
 354         if tok == EOF {
 355             if eof_eis.len() == 1u {
 356                 let mut v = ~[];
 357                 for vec::each_mut(eof_eis[0u].matches) |dv| {
 358                     v.push(dv.pop());
 359                 }
 360                 return success(nameize(sess, ms, v));
 361             } else if eof_eis.len() > 1u {
 362                 return error(sp, ~"Ambiguity: multiple successful parses");
 363             } else {
 364                 return failure(sp, ~"Unexpected end of macro invocation");
 365             }
 366         } else {
 367             if (bb_eis.len() > 0u && next_eis.len() > 0u)
 368                 || bb_eis.len() > 1u {
 369                 let nts = str::connect(vec::map(bb_eis, |ei| {
 370                     match ei.elts[ei.idx].node {
 371                       match_nonterminal(bind,name,_) => {
 372                         fmt!("%s ('%s')", *sess.interner.get(name),
 373                              *sess.interner.get(bind))
 374                       }
 375                       _ => fail!()
 376                     } }), ~" or ");
 377                 return error(sp, fmt!(
 378                     "Local ambiguity: multiple parsing options: \
 379                      built-in NTs %s or %u other options.",
 380                     nts, next_eis.len()));
 381             } else if (bb_eis.len() == 0u && next_eis.len() == 0u) {
 382                 return failure(sp, ~"No rules expected the token: "
 383                             + to_str(rdr.interner(), &tok));
 384             } else if (next_eis.len() > 0u) {
 385                 /* Now process the next token */
 386                 while(next_eis.len() > 0u) {
 387                     cur_eis.push(next_eis.pop());
 388                 }
 389                 rdr.next_token();
 390             } else /* bb_eis.len() == 1 */ {
 391                 let rust_parser = Parser(sess, copy cfg, rdr.dup());
 392
 393                 let mut ei = bb_eis.pop();
 394                 match ei.elts[ei.idx].node {
 395                   match_nonterminal(_, name, idx) => {
 396                     ei.matches[idx].push(@matched_nonterminal(
 397                         parse_nt(rust_parser, *sess.interner.get(name))));
 398                     ei.idx += 1u;
 399                   }
 400                   _ => fail!()
 401                 }
 402                 cur_eis.push(ei);
 403
 404                 for rust_parser.tokens_consumed.times() || {
 405                     rdr.next_token();
 406                 }
 407             }
 408         }
 409
 410         assert!(cur_eis.len() > 0u);
 411     }
 412 }
 413
 414 pub fn parse_nt(p: Parser, name: ~str) -> nonterminal {
 415     match name {
 416       ~"item" => match p.parse_item(~[]) {
 417         Some(i) => token::nt_item(i),
 418         None => p.fatal(~"expected an item keyword")
 419       },
 420       ~"block" => token::nt_block(p.parse_block()),
 421       ~"stmt" => token::nt_stmt(p.parse_stmt(~[])),
 422       ~"pat" => token::nt_pat(p.parse_pat(true)),
 423       ~"expr" => token::nt_expr(p.parse_expr()),
 424       ~"ty" => token::nt_ty(p.parse_ty(false /* no need to disambiguate*/)),
 425       // this could be handled like a token, since it is one
 426       ~"ident" => match *p.token {
 427         token::IDENT(sn,b) => { p.bump(); token::nt_ident(sn,b) }
 428         _ => p.fatal(~"expected ident, found "
 429                      + token::to_str(p.reader.interner(), &copy *p.token))
 430       },
 431       ~"path" => token::nt_path(p.parse_path_with_tps(false)),
 432       ~"tt" => {
 433         *p.quote_depth += 1u; //but in theory, non-quoted tts might be useful
 434         let res = token::nt_tt(@p.parse_token_tree());
 435         *p.quote_depth -= 1u;
 436         res
 437       }
 438       ~"matchers" => token::nt_matchers(p.parse_matchers()),
 439       _ => p.fatal(~"Unsupported builtin nonterminal parser: " + name)
 440     }
 441 }
 442
 443 // Local Variables:
 444 // mode: rust;
 445 // fill-column: 78;
 446 // indent-tabs-mode: nil
 447 // c-basic-offset: 4
 448 // buffer-file-coding-system: utf-8-unix
 449 // End: