]> git.proxmox.com Git - rustc.git/blob - src/libsyntax/ext/tt/macro_parser.rs
Imported Upstream version 0.6
[rustc.git] / src / libsyntax / ext / tt / macro_parser.rs
1 // Copyright 2012 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 // Earley-like parser for macros.
12 use ast;
13 use ast::{matcher, match_tok, match_seq, match_nonterminal, ident};
14 use codemap::{BytePos, mk_sp};
15 use codemap;
16 use parse::lexer::*; //resolve bug?
17 use parse::ParseSess;
18 use parse::parser::Parser;
19 use parse::token::{Token, EOF, to_str, nonterminal};
20 use parse::token;
21
22 use core::prelude::*;
23 use core::hashmap::linear::LinearMap;
24
25 /* This is an Earley-like parser, without support for in-grammar nonterminals,
26 only by calling out to the main rust parser for named nonterminals (which it
27 commits to fully when it hits one in a grammar). This means that there are no
28 completer or predictor rules, and therefore no need to store one column per
29 token: instead, there's a set of current Earley items and a set of next
30 ones. Instead of NTs, we have a special case for Kleene star. The big-O, in
31 pathological cases, is worse than traditional Earley parsing, but it's an
32 easier fit for Macro-by-Example-style rules, and I think the overhead is
33 lower. (In order to prevent the pathological case, we'd need to lazily
34 construct the resulting `named_match`es at the very end. It'd be a pain,
35 and require more memory to keep around old items, but it would also save
36 overhead)*/
37
38 /* Quick intro to how the parser works:
39
40 A 'position' is a dot in the middle of a matcher, usually represented as a
41 dot. For example `· a $( a )* a b` is a position, as is `a $( · a )* a b`.
42
43 The parser walks through the input a character at a time, maintaining a list
44 of items consistent with the current position in the input string: `cur_eis`.
45
46 As it processes them, it fills up `eof_eis` with items that would be valid if
47 the macro invocation is now over, `bb_eis` with items that are waiting on
48 a Rust nonterminal like `$e:expr`, and `next_eis` with items that are waiting
49 on the a particular token. Most of the logic concerns moving the · through the
50 repetitions indicated by Kleene stars. It only advances or calls out to the
51 real Rust parser when no `cur_eis` items remain
52
53 Example: Start parsing `a a a a b` against [· a $( a )* a b].
54
55 Remaining input: `a a a a b`
56 next_eis: [· a $( a )* a b]
57
58 - - - Advance over an `a`. - - -
59
60 Remaining input: `a a a b`
61 cur: [a · $( a )* a b]
62 Descend/Skip (first item).
63 next: [a $( · a )* a b] [a $( a )* · a b].
64
65 - - - Advance over an `a`. - - -
66
67 Remaining input: `a a b`
68 cur: [a $( a · )* a b] next: [a $( a )* a · b]
69 Finish/Repeat (first item)
70 next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
71
72 - - - Advance over an `a`. - - - (this looks exactly like the last step)
73
74 Remaining input: `a b`
75 cur: [a $( a · )* a b] next: [a $( a )* a · b]
76 Finish/Repeat (first item)
77 next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
78
79 - - - Advance over an `a`. - - - (this looks exactly like the last step)
80
81 Remaining input: `b`
82 cur: [a $( a · )* a b] next: [a $( a )* a · b]
83 Finish/Repeat (first item)
84 next: [a $( a )* · a b] [a $( · a )* a b]
85
86 - - - Advance over a `b`. - - -
87
88 Remaining input: ``
89 eof: [a $( a )* a b ·]
90
91 */
92
93
94 /* to avoid costly uniqueness checks, we require that `match_seq` always has a
95 nonempty body. */
96
97 pub enum matcher_pos_up { /* to break a circularity */
98 matcher_pos_up(Option<~MatcherPos>)
99 }
100
101 pub fn is_some(&&mpu: matcher_pos_up) -> bool {
102 match &mpu {
103 &matcher_pos_up(None) => false,
104 _ => true
105 }
106 }
107
108 pub struct MatcherPos {
109 elts: ~[ast::matcher], // maybe should be <'>? Need to understand regions.
110 sep: Option<Token>,
111 idx: uint,
112 up: matcher_pos_up, // mutable for swapping only
113 matches: ~[~[@named_match]],
114 match_lo: uint, match_hi: uint,
115 sp_lo: BytePos,
116 }
117
118 pub fn copy_up(&& mpu: matcher_pos_up) -> ~MatcherPos {
119 match &mpu {
120 &matcher_pos_up(Some(ref mp)) => copy (*mp),
121 _ => fail!()
122 }
123 }
124
125 pub fn count_names(ms: &[matcher]) -> uint {
126 vec::foldl(0u, ms, |ct, m| {
127 ct + match m.node {
128 match_tok(_) => 0u,
129 match_seq(ref more_ms, _, _, _, _) => count_names((*more_ms)),
130 match_nonterminal(_,_,_) => 1u
131 }})
132 }
133
134 #[allow(non_implicitly_copyable_typarams)]
135 pub fn initial_matcher_pos(+ms: ~[matcher], +sep: Option<Token>, lo: BytePos)
136 -> ~MatcherPos {
137 let mut match_idx_hi = 0u;
138 for ms.each |elt| {
139 match elt.node {
140 match_tok(_) => (),
141 match_seq(_,_,_,_,hi) => {
142 match_idx_hi = hi; // it is monotonic...
143 }
144 match_nonterminal(_,_,pos) => {
145 match_idx_hi = pos+1u; // ...so latest is highest
146 }
147 }
148 }
149 let matches = vec::from_fn(count_names(ms), |_i| ~[]);
150 ~MatcherPos {
151 elts: ms,
152 sep: sep,
153 idx: 0u,
154 up: matcher_pos_up(None),
155 matches: matches,
156 match_lo: 0u,
157 match_hi: match_idx_hi,
158 sp_lo: lo
159 }
160 }
161
162 // named_match is a pattern-match result for a single ast::match_nonterminal:
163 // so it is associated with a single ident in a parse, and all
164 // matched_nonterminals in the named_match have the same nonterminal type
165 // (expr, item, etc). All the leaves in a single named_match correspond to a
166 // single matcher_nonterminal in the ast::matcher that produced it.
167 //
168 // It should probably be renamed, it has more or less exact correspondence to
169 // ast::match nodes, and the in-memory structure of a particular named_match
170 // represents the match that occurred when a particular subset of an
171 // ast::match -- those ast::matcher nodes leading to a single
172 // match_nonterminal -- was applied to a particular token tree.
173 //
174 // The width of each matched_seq in the named_match, and the identity of the
175 // matched_nonterminals, will depend on the token tree it was applied to: each
176 // matched_seq corresponds to a single match_seq in the originating
177 // ast::matcher. The depth of the named_match structure will therefore depend
178 // only on the nesting depth of ast::match_seqs in the originating
179 // ast::matcher it was derived from.
180
181 pub enum named_match {
182 matched_seq(~[@named_match], codemap::span),
183 matched_nonterminal(nonterminal)
184 }
185
186 pub type earley_item = ~MatcherPos;
187
188 pub fn nameize(p_s: @mut ParseSess, ms: ~[matcher], res: ~[@named_match])
189 -> LinearMap<ident,@named_match> {
190 fn n_rec(p_s: @mut ParseSess, m: matcher, res: ~[@named_match],
191 ret_val: &mut LinearMap<ident, @named_match>) {
192 match m {
193 codemap::spanned {node: match_tok(_), _} => (),
194 codemap::spanned {node: match_seq(ref more_ms, _, _, _, _), _} => {
195 for (*more_ms).each() |next_m| {
196 n_rec(p_s, *next_m, res, ret_val)
197 };
198 }
199 codemap::spanned {
200 node: match_nonterminal(bind_name, _, idx), span: sp
201 } => {
202 if ret_val.contains_key(&bind_name) {
203 p_s.span_diagnostic.span_fatal(sp, ~"Duplicated bind name: "+
204 *p_s.interner.get(bind_name))
205 }
206 ret_val.insert(bind_name, res[idx]);
207 }
208 }
209 }
210 let mut ret_val = LinearMap::new();
211 for ms.each() |m| { n_rec(p_s, *m, res, &mut ret_val) }
212 return ret_val;
213 }
214
215 pub enum parse_result {
216 success(LinearMap<ident, @named_match>),
217 failure(codemap::span, ~str),
218 error(codemap::span, ~str)
219 }
220
221 pub fn parse_or_else(
222 sess: @mut ParseSess,
223 +cfg: ast::crate_cfg,
224 rdr: @reader,
225 ms: ~[matcher]
226 ) -> LinearMap<ident, @named_match> {
227 match parse(sess, cfg, rdr, ms) {
228 success(m) => m,
229 failure(sp, str) => sess.span_diagnostic.span_fatal(sp, str),
230 error(sp, str) => sess.span_diagnostic.span_fatal(sp, str)
231 }
232 }
233
234 pub fn parse(
235 sess: @mut ParseSess,
236 cfg: ast::crate_cfg,
237 rdr: @reader,
238 ms: ~[matcher]
239 ) -> parse_result {
240 let mut cur_eis = ~[];
241 cur_eis.push(initial_matcher_pos(copy ms, None, rdr.peek().sp.lo));
242
243 loop {
244 let mut bb_eis = ~[]; // black-box parsed by parser.rs
245 let mut next_eis = ~[]; // or proceed normally
246 let mut eof_eis = ~[];
247
248 let TokenAndSpan {tok: tok, sp: sp} = rdr.peek();
249
250 /* we append new items to this while we go */
251 while cur_eis.len() > 0u { /* for each Earley Item */
252 let mut ei = cur_eis.pop();
253
254 let idx = ei.idx;
255 let len = ei.elts.len();
256
257 /* at end of sequence */
258 if idx >= len {
259 // can't move out of `match`es, so:
260 if is_some(ei.up) {
261 // hack: a matcher sequence is repeating iff it has a
262 // parent (the top level is just a container)
263
264
265 // disregard separator, try to go up
266 // (remove this condition to make trailing seps ok)
267 if idx == len {
268 // pop from the matcher position
269
270 let mut new_pos = copy_up(ei.up);
271
272 // update matches (the MBE "parse tree") by appending
273 // each tree as a subtree.
274
275 // I bet this is a perf problem: we're preemptively
276 // doing a lot of array work that will get thrown away
277 // most of the time.
278
279 // Only touch the binders we have actually bound
280 for uint::range(ei.match_lo, ei.match_hi) |idx| {
281 let sub = ei.matches[idx];
282 new_pos.matches[idx]
283 .push(@matched_seq(sub,
284 mk_sp(ei.sp_lo,
285 sp.hi)));
286 }
287
288 new_pos.idx += 1;
289 cur_eis.push(new_pos);
290 }
291
292 // can we go around again?
293
294 // the *_t vars are workarounds for the lack of unary move
295 match copy ei.sep {
296 Some(ref t) if idx == len => { // we need a separator
297 if tok == (*t) { //pass the separator
298 let mut ei_t = ei;
299 ei_t.idx += 1;
300 next_eis.push(ei_t);
301 }
302 }
303 _ => { // we don't need a separator
304 let mut ei_t = ei;
305 ei_t.idx = 0;
306 cur_eis.push(ei_t);
307 }
308 }
309 } else {
310 eof_eis.push(ei);
311 }
312 } else {
313 match copy ei.elts[idx].node {
314 /* need to descend into sequence */
315 match_seq(ref matchers, ref sep, zero_ok,
316 match_idx_lo, match_idx_hi) => {
317 if zero_ok {
318 let mut new_ei = copy ei;
319 new_ei.idx += 1u;
320 //we specifically matched zero repeats.
321 for uint::range(match_idx_lo, match_idx_hi) |idx| {
322 new_ei.matches[idx].push(@matched_seq(~[], sp));
323 }
324
325 cur_eis.push(new_ei);
326 }
327
328 let matches = vec::map(ei.matches, // fresh, same size:
329 |_m| ~[]);
330 let ei_t = ei;
331 cur_eis.push(~MatcherPos {
332 elts: copy *matchers,
333 sep: copy *sep,
334 idx: 0u,
335 up: matcher_pos_up(Some(ei_t)),
336 matches: matches,
337 match_lo: match_idx_lo, match_hi: match_idx_hi,
338 sp_lo: sp.lo
339 });
340 }
341 match_nonterminal(_,_,_) => { bb_eis.push(ei) }
342 match_tok(ref t) => {
343 let mut ei_t = ei;
344 if (*t) == tok {
345 ei_t.idx += 1;
346 next_eis.push(ei_t);
347 }
348 }
349 }
350 }
351 }
352
353 /* error messages here could be improved with links to orig. rules */
354 if tok == EOF {
355 if eof_eis.len() == 1u {
356 let mut v = ~[];
357 for vec::each_mut(eof_eis[0u].matches) |dv| {
358 v.push(dv.pop());
359 }
360 return success(nameize(sess, ms, v));
361 } else if eof_eis.len() > 1u {
362 return error(sp, ~"Ambiguity: multiple successful parses");
363 } else {
364 return failure(sp, ~"Unexpected end of macro invocation");
365 }
366 } else {
367 if (bb_eis.len() > 0u && next_eis.len() > 0u)
368 || bb_eis.len() > 1u {
369 let nts = str::connect(vec::map(bb_eis, |ei| {
370 match ei.elts[ei.idx].node {
371 match_nonterminal(bind,name,_) => {
372 fmt!("%s ('%s')", *sess.interner.get(name),
373 *sess.interner.get(bind))
374 }
375 _ => fail!()
376 } }), ~" or ");
377 return error(sp, fmt!(
378 "Local ambiguity: multiple parsing options: \
379 built-in NTs %s or %u other options.",
380 nts, next_eis.len()));
381 } else if (bb_eis.len() == 0u && next_eis.len() == 0u) {
382 return failure(sp, ~"No rules expected the token: "
383 + to_str(rdr.interner(), &tok));
384 } else if (next_eis.len() > 0u) {
385 /* Now process the next token */
386 while(next_eis.len() > 0u) {
387 cur_eis.push(next_eis.pop());
388 }
389 rdr.next_token();
390 } else /* bb_eis.len() == 1 */ {
391 let rust_parser = Parser(sess, copy cfg, rdr.dup());
392
393 let mut ei = bb_eis.pop();
394 match ei.elts[ei.idx].node {
395 match_nonterminal(_, name, idx) => {
396 ei.matches[idx].push(@matched_nonterminal(
397 parse_nt(rust_parser, *sess.interner.get(name))));
398 ei.idx += 1u;
399 }
400 _ => fail!()
401 }
402 cur_eis.push(ei);
403
404 for rust_parser.tokens_consumed.times() || {
405 rdr.next_token();
406 }
407 }
408 }
409
410 assert!(cur_eis.len() > 0u);
411 }
412 }
413
414 pub fn parse_nt(p: Parser, name: ~str) -> nonterminal {
415 match name {
416 ~"item" => match p.parse_item(~[]) {
417 Some(i) => token::nt_item(i),
418 None => p.fatal(~"expected an item keyword")
419 },
420 ~"block" => token::nt_block(p.parse_block()),
421 ~"stmt" => token::nt_stmt(p.parse_stmt(~[])),
422 ~"pat" => token::nt_pat(p.parse_pat(true)),
423 ~"expr" => token::nt_expr(p.parse_expr()),
424 ~"ty" => token::nt_ty(p.parse_ty(false /* no need to disambiguate*/)),
425 // this could be handled like a token, since it is one
426 ~"ident" => match *p.token {
427 token::IDENT(sn,b) => { p.bump(); token::nt_ident(sn,b) }
428 _ => p.fatal(~"expected ident, found "
429 + token::to_str(p.reader.interner(), &copy *p.token))
430 },
431 ~"path" => token::nt_path(p.parse_path_with_tps(false)),
432 ~"tt" => {
433 *p.quote_depth += 1u; //but in theory, non-quoted tts might be useful
434 let res = token::nt_tt(@p.parse_token_tree());
435 *p.quote_depth -= 1u;
436 res
437 }
438 ~"matchers" => token::nt_matchers(p.parse_matchers()),
439 _ => p.fatal(~"Unsupported builtin nonterminal parser: " + name)
440 }
441 }
442
443 // Local Variables:
444 // mode: rust;
445 // fill-column: 78;
446 // indent-tabs-mode: nil
447 // c-basic-offset: 4
448 // buffer-file-coding-system: utf-8-unix
449 // End: