1 // Copyright 2012 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // Earley-like parser for macros.
13 use ast
::{matcher, match_tok, match_seq, match_nonterminal, ident}
;
14 use codemap
::{BytePos, mk_sp}
;
16 use parse
::lexer
::*; //resolve bug?
18 use parse
::parser
::Parser
;
19 use parse
::token
::{Token, EOF, to_str, nonterminal}
;
23 use core
::hashmap
::linear
::LinearMap
;
25 /* This is an Earley-like parser, without support for in-grammar nonterminals,
26 only by calling out to the main rust parser for named nonterminals (which it
27 commits to fully when it hits one in a grammar). This means that there are no
28 completer or predictor rules, and therefore no need to store one column per
29 token: instead, there's a set of current Earley items and a set of next
30 ones. Instead of NTs, we have a special case for Kleene star. The big-O, in
31 pathological cases, is worse than traditional Earley parsing, but it's an
32 easier fit for Macro-by-Example-style rules, and I think the overhead is
33 lower. (In order to prevent the pathological case, we'd need to lazily
34 construct the resulting `named_match`es at the very end. It'd be a pain,
35 and require more memory to keep around old items, but it would also save
38 /* Quick intro to how the parser works:
40 A 'position' is a dot in the middle of a matcher, usually represented as a
41 dot. For example `· a $( a )* a b` is a position, as is `a $( · a )* a b`.
43 The parser walks through the input a character at a time, maintaining a list
44 of items consistent with the current position in the input string: `cur_eis`.
46 As it processes them, it fills up `eof_eis` with items that would be valid if
47 the macro invocation is now over, `bb_eis` with items that are waiting on
48 a Rust nonterminal like `$e:expr`, and `next_eis` with items that are waiting
49 on the a particular token. Most of the logic concerns moving the · through the
50 repetitions indicated by Kleene stars. It only advances or calls out to the
51 real Rust parser when no `cur_eis` items remain
53 Example: Start parsing `a a a a b` against [· a $( a )* a b].
55 Remaining input: `a a a a b`
56 next_eis: [· a $( a )* a b]
58 - - - Advance over an `a`. - - -
60 Remaining input: `a a a b`
61 cur: [a · $( a )* a b]
62 Descend/Skip (first item).
63 next: [a $( · a )* a b] [a $( a )* · a b].
65 - - - Advance over an `a`. - - -
67 Remaining input: `a a b`
68 cur: [a $( a · )* a b] next: [a $( a )* a · b]
69 Finish/Repeat (first item)
70 next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
72 - - - Advance over an `a`. - - - (this looks exactly like the last step)
74 Remaining input: `a b`
75 cur: [a $( a · )* a b] next: [a $( a )* a · b]
76 Finish/Repeat (first item)
77 next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
79 - - - Advance over an `a`. - - - (this looks exactly like the last step)
82 cur: [a $( a · )* a b] next: [a $( a )* a · b]
83 Finish/Repeat (first item)
84 next: [a $( a )* · a b] [a $( · a )* a b]
86 - - - Advance over a `b`. - - -
89 eof: [a $( a )* a b ·]
94 /* to avoid costly uniqueness checks, we require that `match_seq` always has a
97 pub enum matcher_pos_up
{ /* to break a circularity */
98 matcher_pos_up(Option
<~MatcherPos
>)
101 pub fn is_some(&&mpu
: matcher_pos_up
) -> bool
{
103 &matcher_pos_up(None
) => false,
108 pub struct MatcherPos
{
109 elts
: ~[ast
::matcher
], // maybe should be <'>? Need to understand regions.
112 up
: matcher_pos_up
, // mutable for swapping only
113 matches
: ~[~[@named_match
]],
114 match_lo
: uint
, match_hi
: uint
,
118 pub fn copy_up(&& mpu
: matcher_pos_up
) -> ~MatcherPos
{
120 &matcher_pos_up(Some(ref mp
)) => copy (*mp
),
125 pub fn count_names(ms
: &[matcher
]) -> uint
{
126 vec
::foldl(0u, ms
, |ct
, m
| {
129 match_seq(ref more_ms
, _
, _
, _
, _
) => count_names((*more_ms
)),
130 match_nonterminal(_
,_
,_
) => 1u
134 #[allow(non_implicitly_copyable_typarams)]
135 pub fn initial_matcher_pos(+ms
: ~[matcher
], +sep
: Option
<Token
>, lo
: BytePos
)
137 let mut match_idx_hi
= 0u;
141 match_seq(_
,_
,_
,_
,hi
) => {
142 match_idx_hi
= hi
; // it is monotonic...
144 match_nonterminal(_
,_
,pos
) => {
145 match_idx_hi
= pos
+1u; // ...so latest is highest
149 let matches
= vec
::from_fn(count_names(ms
), |_i
| ~[]);
154 up
: matcher_pos_up(None
),
157 match_hi
: match_idx_hi
,
162 // named_match is a pattern-match result for a single ast::match_nonterminal:
163 // so it is associated with a single ident in a parse, and all
164 // matched_nonterminals in the named_match have the same nonterminal type
165 // (expr, item, etc). All the leaves in a single named_match correspond to a
166 // single matcher_nonterminal in the ast::matcher that produced it.
168 // It should probably be renamed, it has more or less exact correspondence to
169 // ast::match nodes, and the in-memory structure of a particular named_match
170 // represents the match that occurred when a particular subset of an
171 // ast::match -- those ast::matcher nodes leading to a single
172 // match_nonterminal -- was applied to a particular token tree.
174 // The width of each matched_seq in the named_match, and the identity of the
175 // matched_nonterminals, will depend on the token tree it was applied to: each
176 // matched_seq corresponds to a single match_seq in the originating
177 // ast::matcher. The depth of the named_match structure will therefore depend
178 // only on the nesting depth of ast::match_seqs in the originating
179 // ast::matcher it was derived from.
181 pub enum named_match
{
182 matched_seq(~[@named_match
], codemap
::span
),
183 matched_nonterminal(nonterminal
)
186 pub type earley_item
= ~MatcherPos
;
188 pub fn nameize(p_s
: @
mut ParseSess
, ms
: ~[matcher
], res
: ~[@named_match
])
189 -> LinearMap
<ident
,@named_match
> {
190 fn n_rec(p_s
: @
mut ParseSess
, m
: matcher
, res
: ~[@named_match
],
191 ret_val
: &mut LinearMap
<ident
, @named_match
>) {
193 codemap
::spanned {node: match_tok(_), _}
=> (),
194 codemap
::spanned {node: match_seq(ref more_ms, _, _, _, _), _}
=> {
195 for (*more_ms
).each() |next_m
| {
196 n_rec(p_s
, *next_m
, res
, ret_val
)
200 node
: match_nonterminal(bind_name
, _
, idx
), span
: sp
202 if ret_val
.contains_key(&bind_name
) {
203 p_s
.span_diagnostic
.span_fatal(sp
, ~"Duplicated bind name: "+
204 *p_s
.interner
.get(bind_name
))
206 ret_val
.insert(bind_name
, res
[idx
]);
210 let mut ret_val
= LinearMap
::new();
211 for ms
.each() |m
| { n_rec(p_s, *m, res, &mut ret_val) }
215 pub enum parse_result
{
216 success(LinearMap
<ident
, @named_match
>),
217 failure(codemap
::span
, ~str),
218 error(codemap
::span
, ~str)
221 pub fn parse_or_else(
222 sess
: @
mut ParseSess
,
223 +cfg
: ast
::crate_cfg
,
226 ) -> LinearMap
<ident
, @named_match
> {
227 match parse(sess
, cfg
, rdr
, ms
) {
229 failure(sp
, str) => sess
.span_diagnostic
.span_fatal(sp
, str),
230 error(sp
, str) => sess
.span_diagnostic
.span_fatal(sp
, str)
235 sess
: @
mut ParseSess
,
240 let mut cur_eis
= ~[];
241 cur_eis
.push(initial_matcher_pos(copy ms
, None
, rdr
.peek().sp
.lo
));
244 let mut bb_eis
= ~[]; // black-box parsed by parser.rs
245 let mut next_eis
= ~[]; // or proceed normally
246 let mut eof_eis
= ~[];
248 let TokenAndSpan {tok: tok, sp: sp}
= rdr
.peek();
250 /* we append new items to this while we go */
251 while cur_eis
.len() > 0u { /* for each Earley Item */
252 let mut ei
= cur_eis
.pop();
255 let len
= ei
.elts
.len();
257 /* at end of sequence */
259 // can't move out of `match`es, so:
261 // hack: a matcher sequence is repeating iff it has a
262 // parent (the top level is just a container)
265 // disregard separator, try to go up
266 // (remove this condition to make trailing seps ok)
268 // pop from the matcher position
270 let mut new_pos
= copy_up(ei
.up
);
272 // update matches (the MBE "parse tree") by appending
273 // each tree as a subtree.
275 // I bet this is a perf problem: we're preemptively
276 // doing a lot of array work that will get thrown away
279 // Only touch the binders we have actually bound
280 for uint
::range(ei
.match_lo
, ei
.match_hi
) |idx
| {
281 let sub
= ei
.matches
[idx
];
283 .push(@
matched_seq(sub
,
289 cur_eis
.push(new_pos
);
292 // can we go around again?
294 // the *_t vars are workarounds for the lack of unary move
296 Some(ref t
) if idx
== len
=> { // we need a separator
297 if tok
== (*t
) { //pass the separator
303 _
=> { // we don't need a separator
313 match copy ei
.elts
[idx
].node
{
314 /* need to descend into sequence */
315 match_seq(ref matchers
, ref sep
, zero_ok
,
316 match_idx_lo
, match_idx_hi
) => {
318 let mut new_ei
= copy ei
;
320 //we specifically matched zero repeats.
321 for uint
::range(match_idx_lo
, match_idx_hi
) |idx
| {
322 new_ei
.matches
[idx
].push(@
matched_seq(~[], sp
));
325 cur_eis
.push(new_ei
);
328 let matches
= vec
::map(ei
.matches
, // fresh, same size:
331 cur_eis
.push(~MatcherPos
{
332 elts
: copy
*matchers
,
335 up
: matcher_pos_up(Some(ei_t
)),
337 match_lo
: match_idx_lo
, match_hi
: match_idx_hi
,
341 match_nonterminal(_
,_
,_
) => { bb_eis.push(ei) }
342 match_tok(ref t
) => {
353 /* error messages here could be improved with links to orig. rules */
355 if eof_eis
.len() == 1u {
357 for vec
::each_mut(eof_eis
[0u].matches
) |dv
| {
360 return success(nameize(sess
, ms
, v
));
361 } else if eof_eis
.len() > 1u {
362 return error(sp
, ~"Ambiguity: multiple successful parses");
364 return failure(sp
, ~"Unexpected end of macro invocation");
367 if (bb_eis
.len() > 0u && next_eis
.len() > 0u)
368 || bb_eis
.len() > 1u {
369 let nts
= str::connect(vec
::map(bb_eis
, |ei
| {
370 match ei
.elts
[ei
.idx
].node
{
371 match_nonterminal(bind
,name
,_
) => {
372 fmt
!("%s ('%s')", *sess
.interner
.get(name
),
373 *sess
.interner
.get(bind
))
377 return error(sp
, fmt
!(
378 "Local ambiguity: multiple parsing options: \
379 built-in NTs %s or %u other options.",
380 nts
, next_eis
.len()));
381 } else if (bb_eis
.len() == 0u && next_eis
.len() == 0u) {
382 return failure(sp
, ~"No rules expected the token: "
383 + to_str(rdr
.interner(), &tok
));
384 } else if (next_eis
.len() > 0u) {
385 /* Now process the next token */
386 while(next_eis
.len() > 0u) {
387 cur_eis
.push(next_eis
.pop());
390 } else /* bb_eis.len() == 1 */ {
391 let rust_parser
= Parser(sess
, copy cfg
, rdr
.dup());
393 let mut ei
= bb_eis
.pop();
394 match ei
.elts
[ei
.idx
].node
{
395 match_nonterminal(_
, name
, idx
) => {
396 ei
.matches
[idx
].push(@
matched_nonterminal(
397 parse_nt(rust_parser
, *sess
.interner
.get(name
))));
404 for rust_parser
.tokens_consumed
.times() || {
410 assert
!(cur_eis
.len() > 0u);
414 pub fn parse_nt(p
: Parser
, name
: ~str) -> nonterminal
{
416 ~"item" => match p
.parse_item(~[]) {
417 Some(i
) => token
::nt_item(i
),
418 None
=> p
.fatal(~"expected an item keyword")
420 ~"block" => token
::nt_block(p
.parse_block()),
421 ~"stmt" => token
::nt_stmt(p
.parse_stmt(~[])),
422 ~"pat" => token
::nt_pat(p
.parse_pat(true)),
423 ~"expr" => token
::nt_expr(p
.parse_expr()),
424 ~"ty" => token
::nt_ty(p
.parse_ty(false /* no need to disambiguate*/)),
425 // this could be handled like a token, since it is one
426 ~"ident" => match *p
.token
{
427 token
::IDENT(sn
,b
) => { p.bump(); token::nt_ident(sn,b) }
428 _
=> p
.fatal(~"expected ident, found "
429 + token
::to_str(p
.reader
.interner(), ©
*p
.token
))
431 ~"path" => token
::nt_path(p
.parse_path_with_tps(false)),
433 *p
.quote_depth
+= 1u; //but in theory, non-quoted tts might be useful
434 let res
= token
::nt_tt(@p
.parse_token_tree());
435 *p
.quote_depth
-= 1u;
438 ~"matchers" => token
::nt_matchers(p
.parse_matchers()),
439 _
=> p
.fatal(~"Unsupported builtin nonterminal parser: " + name
)
446 // indent-tabs-mode: nil
448 // buffer-file-coding-system: utf-8-unix