]> git.proxmox.com Git - rustc.git/blame - src/libsyntax/parse/lexer/mod.rs
Imported Upstream version 1.9.0+dfsg1
[rustc.git] / src / libsyntax / parse / lexer / mod.rs
CommitLineData
1a4d82fc
JJ
1// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use ast;
12use codemap::{BytePos, CharPos, CodeMap, Pos, Span};
13use codemap;
9cc50fc6 14use errors::{FatalError, Handler, DiagnosticBuilder};
1a4d82fc 15use ext::tt::transcribe::tt_next_token;
c34b1796 16use parse::token::str_to_ident;
d9579d0f
AL
17use parse::token;
18use str::char_at;
54a0048b 19use rustc_unicode::property::Pattern_White_Space;
1a4d82fc 20
d9579d0f 21use std::borrow::Cow;
1a4d82fc 22use std::char;
1a4d82fc 23use std::mem::replace;
1a4d82fc 24use std::rc::Rc;
1a4d82fc
JJ
25
26pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag};
27
28pub mod comments;
92a42be0 29mod unicode_chars;
1a4d82fc
JJ
30
31pub trait Reader {
32 fn is_eof(&self) -> bool;
33 fn next_token(&mut self) -> TokenAndSpan;
34 /// Report a fatal error with the current span.
92a42be0 35 fn fatal(&self, &str) -> FatalError;
1a4d82fc
JJ
36 /// Report a non-fatal error with the current span.
37 fn err(&self, &str);
38 fn peek(&self) -> TokenAndSpan;
39 /// Get a token the parser cares about.
40 fn real_token(&mut self) -> TokenAndSpan {
41 let mut t = self.next_token();
42 loop {
43 match t.tok {
44 token::Whitespace | token::Comment | token::Shebang(_) => {
45 t = self.next_token();
9cc50fc6
SL
46 }
47 _ => break,
1a4d82fc
JJ
48 }
49 }
50 t
51 }
52}
53
85aaf69f 54#[derive(Clone, PartialEq, Eq, Debug)]
1a4d82fc
JJ
55pub struct TokenAndSpan {
56 pub tok: token::Token,
57 pub sp: Span,
58}
59
60pub struct StringReader<'a> {
9cc50fc6 61 pub span_diagnostic: &'a Handler,
1a4d82fc
JJ
62 /// The absolute offset within the codemap of the next character to read
63 pub pos: BytePos,
64 /// The absolute offset within the codemap of the last character read(curr)
65 pub last_pos: BytePos,
66 /// The column of the next character to read
67 pub col: CharPos,
68 /// The last character to be read
69 pub curr: Option<char>,
70 pub filemap: Rc<codemap::FileMap>,
9cc50fc6 71 // cached:
1a4d82fc
JJ
72 pub peek_tok: token::Token,
73 pub peek_span: Span,
74
c34b1796
AL
75 // cache a direct reference to the source text, so that we don't have to
76 // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time.
9cc50fc6 77 source_text: Rc<String>,
1a4d82fc
JJ
78}
79
80impl<'a> Reader for StringReader<'a> {
9cc50fc6
SL
81 fn is_eof(&self) -> bool {
82 self.curr.is_none()
83 }
1a4d82fc
JJ
84 /// Return the next token. EFFECT: advances the string_reader.
85 fn next_token(&mut self) -> TokenAndSpan {
86 let ret_val = TokenAndSpan {
87 tok: replace(&mut self.peek_tok, token::Underscore),
88 sp: self.peek_span,
89 };
90 self.advance_token();
91 ret_val
92 }
92a42be0 93 fn fatal(&self, m: &str) -> FatalError {
1a4d82fc
JJ
94 self.fatal_span(self.peek_span, m)
95 }
96 fn err(&self, m: &str) {
97 self.err_span(self.peek_span, m)
98 }
99 fn peek(&self) -> TokenAndSpan {
100 // FIXME(pcwalton): Bad copy!
101 TokenAndSpan {
102 tok: self.peek_tok.clone(),
103 sp: self.peek_span,
104 }
105 }
106}
107
108impl<'a> Reader for TtReader<'a> {
109 fn is_eof(&self) -> bool {
110 self.cur_tok == token::Eof
111 }
112 fn next_token(&mut self) -> TokenAndSpan {
113 let r = tt_next_token(self);
114 debug!("TtReader: r={:?}", r);
115 r
116 }
92a42be0
SL
117 fn fatal(&self, m: &str) -> FatalError {
118 self.sp_diag.span_fatal(self.cur_span, m)
1a4d82fc
JJ
119 }
120 fn err(&self, m: &str) {
121 self.sp_diag.span_err(self.cur_span, m);
122 }
123 fn peek(&self) -> TokenAndSpan {
124 TokenAndSpan {
125 tok: self.cur_tok.clone(),
126 sp: self.cur_span,
127 }
128 }
129}
130
1a4d82fc
JJ
131impl<'a> StringReader<'a> {
132 /// For comments.rs, which hackily pokes into pos and curr
9cc50fc6
SL
133 pub fn new_raw<'b>(span_diagnostic: &'b Handler,
134 filemap: Rc<codemap::FileMap>)
135 -> StringReader<'b> {
c34b1796 136 if filemap.src.is_none() {
9cc50fc6
SL
137 span_diagnostic.bug(&format!("Cannot lex filemap \
138 without source: {}",
139 filemap.name)[..]);
c34b1796
AL
140 }
141
142 let source_text = (*filemap.src.as_ref().unwrap()).clone();
143
1a4d82fc
JJ
144 let mut sr = StringReader {
145 span_diagnostic: span_diagnostic,
146 pos: filemap.start_pos,
147 last_pos: filemap.start_pos,
148 col: CharPos(0),
149 curr: Some('\n'),
150 filemap: filemap,
9cc50fc6 151 // dummy values; not read
1a4d82fc
JJ
152 peek_tok: token::Eof,
153 peek_span: codemap::DUMMY_SP,
9cc50fc6 154 source_text: source_text,
1a4d82fc
JJ
155 };
156 sr.bump();
157 sr
158 }
159
9cc50fc6
SL
160 pub fn new<'b>(span_diagnostic: &'b Handler,
161 filemap: Rc<codemap::FileMap>)
162 -> StringReader<'b> {
1a4d82fc
JJ
163 let mut sr = StringReader::new_raw(span_diagnostic, filemap);
164 sr.advance_token();
165 sr
166 }
167
168 pub fn curr_is(&self, c: char) -> bool {
169 self.curr == Some(c)
170 }
171
172 /// Report a fatal lexical error with a given span.
92a42be0
SL
173 pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
174 self.span_diagnostic.span_fatal(sp, m)
1a4d82fc
JJ
175 }
176
177 /// Report a lexical error with a given span.
178 pub fn err_span(&self, sp: Span, m: &str) {
179 self.span_diagnostic.span_err(sp, m)
180 }
181
c1a9b12d 182
1a4d82fc 183 /// Report a fatal error spanning [`from_pos`, `to_pos`).
92a42be0 184 fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
1a4d82fc
JJ
185 self.fatal_span(codemap::mk_sp(from_pos, to_pos), m)
186 }
187
188 /// Report a lexical error spanning [`from_pos`, `to_pos`).
189 fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
190 self.err_span(codemap::mk_sp(from_pos, to_pos), m)
191 }
192
193 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
194 /// escaped character to the error message
92a42be0 195 fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
1a4d82fc
JJ
196 let mut m = m.to_string();
197 m.push_str(": ");
9cc50fc6
SL
198 for c in c.escape_default() {
199 m.push(c)
200 }
92a42be0 201 self.fatal_span_(from_pos, to_pos, &m[..])
1a4d82fc 202 }
9cc50fc6
SL
203 fn struct_fatal_span_char(&self,
204 from_pos: BytePos,
205 to_pos: BytePos,
206 m: &str,
207 c: char)
208 -> DiagnosticBuilder<'a> {
209 let mut m = m.to_string();
210 m.push_str(": ");
211 for c in c.escape_default() {
212 m.push(c)
213 }
214 self.span_diagnostic.struct_span_fatal(codemap::mk_sp(from_pos, to_pos), &m[..])
215 }
1a4d82fc
JJ
216
217 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
218 /// escaped character to the error message
219 fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
220 let mut m = m.to_string();
221 m.push_str(": ");
9cc50fc6
SL
222 for c in c.escape_default() {
223 m.push(c)
224 }
85aaf69f 225 self.err_span_(from_pos, to_pos, &m[..]);
1a4d82fc 226 }
9cc50fc6
SL
227 fn struct_err_span_char(&self,
228 from_pos: BytePos,
229 to_pos: BytePos,
230 m: &str,
231 c: char)
232 -> DiagnosticBuilder<'a> {
233 let mut m = m.to_string();
234 m.push_str(": ");
235 for c in c.escape_default() {
236 m.push(c)
237 }
238 self.span_diagnostic.struct_span_err(codemap::mk_sp(from_pos, to_pos), &m[..])
239 }
1a4d82fc
JJ
240
241 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
242 /// offending string to the error message
92a42be0 243 fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
1a4d82fc 244 m.push_str(": ");
85aaf69f
SL
245 let from = self.byte_offset(from_pos).to_usize();
246 let to = self.byte_offset(to_pos).to_usize();
c34b1796 247 m.push_str(&self.source_text[from..to]);
92a42be0 248 self.fatal_span_(from_pos, to_pos, &m[..])
1a4d82fc
JJ
249 }
250
251 /// Advance peek_tok and peek_span to refer to the next token, and
252 /// possibly update the interner.
253 fn advance_token(&mut self) {
254 match self.scan_whitespace_or_comment() {
255 Some(comment) => {
256 self.peek_span = comment.sp;
257 self.peek_tok = comment.tok;
9cc50fc6 258 }
1a4d82fc
JJ
259 None => {
260 if self.is_eof() {
261 self.peek_tok = token::Eof;
c1a9b12d 262 self.peek_span = codemap::mk_sp(self.filemap.end_pos, self.filemap.end_pos);
1a4d82fc
JJ
263 } else {
264 let start_bytepos = self.last_pos;
265 self.peek_tok = self.next_token_inner();
9cc50fc6 266 self.peek_span = codemap::mk_sp(start_bytepos, self.last_pos);
1a4d82fc
JJ
267 };
268 }
269 }
270 }
271
272 fn byte_offset(&self, pos: BytePos) -> BytePos {
273 (pos - self.filemap.start_pos)
274 }
275
276 /// Calls `f` with a string slice of the source text spanning from `start`
277 /// up to but excluding `self.last_pos`, meaning the slice does not include
278 /// the character `self.curr`.
9cc50fc6
SL
279 pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T
280 where F: FnOnce(&str) -> T
1a4d82fc
JJ
281 {
282 self.with_str_from_to(start, self.last_pos, f)
283 }
284
285 /// Create a Name from a given offset to the current offset, each
286 /// adjusted 1 towards each other (assumes that on either side there is a
287 /// single-byte delimiter).
288 pub fn name_from(&self, start: BytePos) -> ast::Name {
289 debug!("taking an ident from {:?} to {:?}", start, self.last_pos);
290 self.with_str_from(start, token::intern)
291 }
292
293 /// As name_from, with an explicit endpoint.
294 pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name {
295 debug!("taking an ident from {:?} to {:?}", start, end);
296 self.with_str_from_to(start, end, token::intern)
297 }
298
299 /// Calls `f` with a string slice of the source text spanning from `start`
300 /// up to but excluding `end`.
9cc50fc6
SL
301 fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T
302 where F: FnOnce(&str) -> T
1a4d82fc 303 {
9cc50fc6 304 f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()])
1a4d82fc
JJ
305 }
306
307 /// Converts CRLF to LF in the given string, raising an error on bare CR.
9cc50fc6 308 fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
85aaf69f 309 let mut i = 0;
1a4d82fc 310 while i < s.len() {
d9579d0f 311 let ch = char_at(s, i);
c34b1796 312 let next = i + ch.len_utf8();
1a4d82fc 313 if ch == '\r' {
d9579d0f
AL
314 if next < s.len() && char_at(s, next) == '\n' {
315 return translate_crlf_(self, start, s, errmsg, i).into();
1a4d82fc
JJ
316 }
317 let pos = start + BytePos(i as u32);
318 let end_pos = start + BytePos(next as u32);
319 self.err_span_(pos, end_pos, errmsg);
320 }
321 i = next;
322 }
d9579d0f 323 return s.into();
1a4d82fc 324
9cc50fc6
SL
325 fn translate_crlf_(rdr: &StringReader,
326 start: BytePos,
327 s: &str,
328 errmsg: &str,
329 mut i: usize)
330 -> String {
1a4d82fc
JJ
331 let mut buf = String::with_capacity(s.len());
332 let mut j = 0;
333 while i < s.len() {
d9579d0f 334 let ch = char_at(s, i);
c34b1796 335 let next = i + ch.len_utf8();
1a4d82fc 336 if ch == '\r' {
9cc50fc6
SL
337 if j < i {
338 buf.push_str(&s[j..i]);
339 }
1a4d82fc 340 j = next;
d9579d0f 341 if next >= s.len() || char_at(s, next) != '\n' {
1a4d82fc
JJ
342 let pos = start + BytePos(i as u32);
343 let end_pos = start + BytePos(next as u32);
344 rdr.err_span_(pos, end_pos, errmsg);
345 }
346 }
347 i = next;
348 }
9cc50fc6
SL
349 if j < s.len() {
350 buf.push_str(&s[j..]);
351 }
1a4d82fc
JJ
352 buf
353 }
354 }
355
356
357 /// Advance the StringReader by one character. If a newline is
358 /// discovered, add it to the FileMap's list of line start offsets.
359 pub fn bump(&mut self) {
360 self.last_pos = self.pos;
85aaf69f 361 let current_byte_offset = self.byte_offset(self.pos).to_usize();
c34b1796 362 if current_byte_offset < self.source_text.len() {
1a4d82fc
JJ
363 assert!(self.curr.is_some());
364 let last_char = self.curr.unwrap();
d9579d0f 365 let ch = char_at(&self.source_text, current_byte_offset);
c34b1796
AL
366 let next = current_byte_offset + ch.len_utf8();
367 let byte_offset_diff = next - current_byte_offset;
85aaf69f 368 self.pos = self.pos + Pos::from_usize(byte_offset_diff);
c34b1796 369 self.curr = Some(ch);
85aaf69f 370 self.col = self.col + CharPos(1);
1a4d82fc
JJ
371 if last_char == '\n' {
372 self.filemap.next_line(self.last_pos);
85aaf69f 373 self.col = CharPos(0);
1a4d82fc
JJ
374 }
375
376 if byte_offset_diff > 1 {
377 self.filemap.record_multibyte_char(self.last_pos, byte_offset_diff);
378 }
379 } else {
380 self.curr = None;
381 }
382 }
383
384 pub fn nextch(&self) -> Option<char> {
85aaf69f 385 let offset = self.byte_offset(self.pos).to_usize();
c34b1796 386 if offset < self.source_text.len() {
d9579d0f 387 Some(char_at(&self.source_text, offset))
1a4d82fc
JJ
388 } else {
389 None
390 }
391 }
392
393 pub fn nextch_is(&self, c: char) -> bool {
394 self.nextch() == Some(c)
395 }
396
397 pub fn nextnextch(&self) -> Option<char> {
85aaf69f 398 let offset = self.byte_offset(self.pos).to_usize();
c34b1796 399 let s = &self.source_text[..];
9cc50fc6
SL
400 if offset >= s.len() {
401 return None;
402 }
d9579d0f 403 let next = offset + char_at(s, offset).len_utf8();
1a4d82fc 404 if next < s.len() {
d9579d0f 405 Some(char_at(s, next))
1a4d82fc
JJ
406 } else {
407 None
408 }
409 }
410
411 pub fn nextnextch_is(&self, c: char) -> bool {
412 self.nextnextch() == Some(c)
413 }
414
415 /// Eats <XID_start><XID_continue>*, if possible.
416 fn scan_optional_raw_name(&mut self) -> Option<ast::Name> {
417 if !ident_start(self.curr) {
9cc50fc6 418 return None;
1a4d82fc
JJ
419 }
420 let start = self.last_pos;
421 while ident_continue(self.curr) {
422 self.bump();
423 }
424
425 self.with_str_from(start, |string| {
426 if string == "_" {
427 None
428 } else {
429 Some(token::intern(string))
430 }
431 })
432 }
433
434 /// PRECONDITION: self.curr is not whitespace
435 /// Eats any kind of comment.
436 fn scan_comment(&mut self) -> Option<TokenAndSpan> {
437 match self.curr {
438 Some(c) => {
439 if c.is_whitespace() {
440 self.span_diagnostic.span_err(codemap::mk_sp(self.last_pos, self.last_pos),
9cc50fc6
SL
441 "called consume_any_line_comment, but there \
442 was whitespace");
1a4d82fc 443 }
9cc50fc6
SL
444 }
445 None => {}
1a4d82fc
JJ
446 }
447
448 if self.curr_is('/') {
449 match self.nextch() {
450 Some('/') => {
451 self.bump();
452 self.bump();
62682a34 453
1a4d82fc 454 // line comments starting with "///" or "//!" are doc-comments
62682a34
SL
455 let doc_comment = self.curr_is('/') || self.curr_is('!');
456 let start_bpos = if doc_comment {
457 self.pos - BytePos(3)
458 } else {
459 self.last_pos - BytePos(2)
460 };
461
462 while !self.is_eof() {
463 match self.curr.unwrap() {
464 '\n' => break,
465 '\r' => {
466 if self.nextch_is('\n') {
467 // CRLF
9cc50fc6 468 break;
62682a34 469 } else if doc_comment {
9cc50fc6
SL
470 self.err_span_(self.last_pos,
471 self.pos,
62682a34 472 "bare CR not allowed in doc-comment");
1a4d82fc 473 }
1a4d82fc 474 }
9cc50fc6 475 _ => (),
1a4d82fc 476 }
62682a34
SL
477 self.bump();
478 }
479
480 return if doc_comment {
481 self.with_str_from(start_bpos, |string| {
482 // comments with only more "/"s are not doc comments
1a4d82fc
JJ
483 let tok = if is_doc_comment(string) {
484 token::DocComment(token::intern(string))
485 } else {
486 token::Comment
487 };
488
62682a34 489 Some(TokenAndSpan {
1a4d82fc 490 tok: tok,
9cc50fc6 491 sp: codemap::mk_sp(start_bpos, self.last_pos),
62682a34
SL
492 })
493 })
1a4d82fc 494 } else {
62682a34 495 Some(TokenAndSpan {
1a4d82fc 496 tok: token::Comment,
9cc50fc6 497 sp: codemap::mk_sp(start_bpos, self.last_pos),
62682a34 498 })
9cc50fc6 499 };
1a4d82fc
JJ
500 }
501 Some('*') => {
9cc50fc6
SL
502 self.bump();
503 self.bump();
1a4d82fc
JJ
504 self.scan_block_comment()
505 }
9cc50fc6 506 _ => None,
1a4d82fc
JJ
507 }
508 } else if self.curr_is('#') {
509 if self.nextch_is('!') {
510
511 // Parse an inner attribute.
512 if self.nextnextch_is('[') {
513 return None;
514 }
515
516 // I guess this is the only way to figure out if
517 // we're at the beginning of the file...
518 let cmap = CodeMap::new();
519 cmap.files.borrow_mut().push(self.filemap.clone());
520 let loc = cmap.lookup_char_pos_adj(self.last_pos);
521 debug!("Skipping a shebang");
85aaf69f 522 if loc.line == 1 && loc.col == CharPos(0) {
1a4d82fc
JJ
523 // FIXME: Add shebang "token", return it
524 let start = self.last_pos;
9cc50fc6
SL
525 while !self.curr_is('\n') && !self.is_eof() {
526 self.bump();
527 }
1a4d82fc
JJ
528 return Some(TokenAndSpan {
529 tok: token::Shebang(self.name_from(start)),
9cc50fc6 530 sp: codemap::mk_sp(start, self.last_pos),
1a4d82fc
JJ
531 });
532 }
533 }
534 None
535 } else {
536 None
537 }
538 }
539
540 /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
541 /// return None.
542 fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
543 match self.curr.unwrap_or('\0') {
544 // # to handle shebang at start of file -- this is the entry point
545 // for skipping over all "junk"
546 '/' | '#' => {
547 let c = self.scan_comment();
548 debug!("scanning a comment {:?}", c);
549 c
54a0048b
SL
550 },
551 c if is_pattern_whitespace(Some(c)) => {
1a4d82fc 552 let start_bpos = self.last_pos;
54a0048b 553 while is_pattern_whitespace(self.curr) {
9cc50fc6
SL
554 self.bump();
555 }
1a4d82fc
JJ
556 let c = Some(TokenAndSpan {
557 tok: token::Whitespace,
9cc50fc6 558 sp: codemap::mk_sp(start_bpos, self.last_pos),
1a4d82fc
JJ
559 });
560 debug!("scanning whitespace: {:?}", c);
561 c
9cc50fc6
SL
562 }
563 _ => None,
1a4d82fc
JJ
564 }
565 }
566
567 /// Might return a sugared-doc-attr
568 fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
569 // block comments starting with "/**" or "/*!" are doc-comments
570 let is_doc_comment = self.curr_is('*') || self.curr_is('!');
571 let start_bpos = self.last_pos - BytePos(2);
572
85aaf69f 573 let mut level: isize = 1;
1a4d82fc
JJ
574 let mut has_cr = false;
575 while level > 0 {
576 if self.is_eof() {
577 let msg = if is_doc_comment {
578 "unterminated block doc-comment"
579 } else {
580 "unterminated block comment"
581 };
582 let last_bpos = self.last_pos;
92a42be0 583 panic!(self.fatal_span_(start_bpos, last_bpos, msg));
1a4d82fc
JJ
584 }
585 let n = self.curr.unwrap();
586 match n {
587 '/' if self.nextch_is('*') => {
588 level += 1;
589 self.bump();
590 }
591 '*' if self.nextch_is('/') => {
592 level -= 1;
593 self.bump();
594 }
595 '\r' => {
596 has_cr = true;
597 }
9cc50fc6 598 _ => (),
1a4d82fc
JJ
599 }
600 self.bump();
601 }
602
603 self.with_str_from(start_bpos, |string| {
604 // but comments with only "*"s between two "/"s are not
605 let tok = if is_block_doc_comment(string) {
606 let string = if has_cr {
9cc50fc6
SL
607 self.translate_crlf(start_bpos,
608 string,
1a4d82fc 609 "bare CR not allowed in block doc-comment")
9cc50fc6
SL
610 } else {
611 string.into()
612 };
85aaf69f 613 token::DocComment(token::intern(&string[..]))
1a4d82fc
JJ
614 } else {
615 token::Comment
616 };
617
9cc50fc6 618 Some(TokenAndSpan {
1a4d82fc 619 tok: tok,
9cc50fc6 620 sp: codemap::mk_sp(start_bpos, self.last_pos),
1a4d82fc
JJ
621 })
622 })
623 }
624
c34b1796
AL
625 /// Scan through any digits (base `scan_radix`) or underscores,
626 /// and return how many digits there were.
627 ///
628 /// `real_radix` represents the true radix of the number we're
629 /// interested in, and errors will be emitted for any digits
630 /// between `real_radix` and `scan_radix`.
631 fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
632 assert!(real_radix <= scan_radix);
85aaf69f 633 let mut len = 0;
1a4d82fc
JJ
634 loop {
635 let c = self.curr;
9cc50fc6
SL
636 if c == Some('_') {
637 debug!("skipping a _");
638 self.bump();
639 continue;
640 }
c34b1796 641 match c.and_then(|cc| cc.to_digit(scan_radix)) {
1a4d82fc
JJ
642 Some(_) => {
643 debug!("{:?} in scan_digits", c);
c34b1796
AL
644 // check that the hypothetical digit is actually
645 // in range for the true radix
646 if c.unwrap().to_digit(real_radix).is_none() {
9cc50fc6
SL
647 self.err_span_(self.last_pos,
648 self.pos,
649 &format!("invalid digit for a base {} literal", real_radix));
c34b1796 650 }
1a4d82fc
JJ
651 len += 1;
652 self.bump();
653 }
9cc50fc6 654 _ => return len,
1a4d82fc 655 }
9cc50fc6 656 }
1a4d82fc
JJ
657 }
658
659 /// Lex a LIT_INTEGER or a LIT_FLOAT
660 fn scan_number(&mut self, c: char) -> token::Lit {
c1a9b12d 661 let num_digits;
1a4d82fc
JJ
662 let mut base = 10;
663 let start_bpos = self.last_pos;
664
665 self.bump();
666
667 if c == '0' {
668 match self.curr.unwrap_or('\0') {
9cc50fc6
SL
669 'b' => {
670 self.bump();
671 base = 2;
672 num_digits = self.scan_digits(2, 10);
673 }
674 'o' => {
675 self.bump();
676 base = 8;
677 num_digits = self.scan_digits(8, 10);
678 }
679 'x' => {
680 self.bump();
681 base = 16;
682 num_digits = self.scan_digits(16, 16);
683 }
1a4d82fc 684 '0'...'9' | '_' | '.' => {
c34b1796 685 num_digits = self.scan_digits(10, 10) + 1;
1a4d82fc
JJ
686 }
687 _ => {
688 // just a 0
689 return token::Integer(self.name_from(start_bpos));
690 }
691 }
692 } else if c.is_digit(10) {
c34b1796 693 num_digits = self.scan_digits(10, 10) + 1;
1a4d82fc
JJ
694 } else {
695 num_digits = 0;
696 }
697
698 if num_digits == 0 {
9cc50fc6
SL
699 self.err_span_(start_bpos,
700 self.last_pos,
701 "no valid digits found for number");
1a4d82fc
JJ
702 return token::Integer(token::intern("0"));
703 }
704
705 // might be a float, but don't be greedy if this is actually an
706 // integer literal followed by field/method access or a range pattern
707 // (`0..2` and `12.foo()`)
9cc50fc6
SL
708 if self.curr_is('.') && !self.nextch_is('.') &&
709 !self.nextch()
710 .unwrap_or('\0')
711 .is_xid_start() {
1a4d82fc
JJ
712 // might have stuff after the ., and if it does, it needs to start
713 // with a number
714 self.bump();
715 if self.curr.unwrap_or('\0').is_digit(10) {
c34b1796 716 self.scan_digits(10, 10);
1a4d82fc
JJ
717 self.scan_float_exponent();
718 }
719 let last_pos = self.last_pos;
720 self.check_float_base(start_bpos, last_pos, base);
721 return token::Float(self.name_from(start_bpos));
722 } else {
723 // it might be a float if it has an exponent
724 if self.curr_is('e') || self.curr_is('E') {
725 self.scan_float_exponent();
726 let last_pos = self.last_pos;
727 self.check_float_base(start_bpos, last_pos, base);
728 return token::Float(self.name_from(start_bpos));
729 }
730 // but we certainly have an integer!
731 return token::Integer(self.name_from(start_bpos));
732 }
733 }
734
735 /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an
736 /// error if too many or too few digits are encountered.
9cc50fc6 737 fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
1a4d82fc
JJ
738 debug!("scanning {} digits until {:?}", n_digits, delim);
739 let start_bpos = self.last_pos;
740 let mut accum_int = 0;
741
c34b1796 742 let mut valid = true;
85aaf69f 743 for _ in 0..n_digits {
1a4d82fc
JJ
744 if self.is_eof() {
745 let last_bpos = self.last_pos;
92a42be0
SL
746 panic!(self.fatal_span_(start_bpos,
747 last_bpos,
748 "unterminated numeric character escape"));
1a4d82fc
JJ
749 }
750 if self.curr_is(delim) {
751 let last_bpos = self.last_pos;
9cc50fc6
SL
752 self.err_span_(start_bpos,
753 last_bpos,
754 "numeric character escape is too short");
c34b1796 755 valid = false;
1a4d82fc
JJ
756 break;
757 }
758 let c = self.curr.unwrap_or('\x00');
759 accum_int *= 16;
760 accum_int += c.to_digit(16).unwrap_or_else(|| {
9cc50fc6
SL
761 self.err_span_char(self.last_pos,
762 self.pos,
763 "invalid character in numeric character escape",
764 c);
c34b1796
AL
765
766 valid = false;
1a4d82fc 767 0
c34b1796 768 });
1a4d82fc
JJ
769 self.bump();
770 }
771
772 if below_0x7f_only && accum_int >= 0x80 {
773 self.err_span_(start_bpos,
774 self.last_pos,
9cc50fc6
SL
775 "this form of character escape may only be used with characters in \
776 the range [\\x00-\\x7f]");
c34b1796 777 valid = false;
1a4d82fc
JJ
778 }
779
780 match char::from_u32(accum_int) {
c34b1796 781 Some(_) => valid,
1a4d82fc
JJ
782 None => {
783 let last_bpos = self.last_pos;
c1a9b12d 784 self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
1a4d82fc
JJ
785 false
786 }
787 }
788 }
789
1a4d82fc
JJ
790 /// Scan for a single (possibly escaped) byte or char
791 /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
792 /// `start` is the position of `first_source_char`, which is already consumed.
793 ///
794 /// Returns true if there was a valid char/byte, false otherwise.
9cc50fc6
SL
795 fn scan_char_or_byte(&mut self,
796 start: BytePos,
797 first_source_char: char,
798 ascii_only: bool,
799 delim: char)
800 -> bool {
1a4d82fc
JJ
801 match first_source_char {
802 '\\' => {
803 // '\X' for some X must be a character constant:
804 let escaped = self.curr;
805 let escaped_pos = self.last_pos;
806 self.bump();
807 match escaped {
9cc50fc6 808 None => {} // EOF here is an error that will be checked later.
1a4d82fc
JJ
809 Some(e) => {
810 return match e {
811 'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
812 'x' => self.scan_byte_escape(delim, !ascii_only),
c1a9b12d
SL
813 'u' => {
814 let valid = if self.curr_is('{') {
815 self.scan_unicode_escape(delim) && !ascii_only
816 } else {
9cc50fc6
SL
817 let span = codemap::mk_sp(start, self.last_pos);
818 self.span_diagnostic
819 .struct_span_err(span, "incorrect unicode escape sequence")
820 .span_help(span,
821 "format of unicode escape sequences is \
822 `\\u{…}`")
823 .emit();
c1a9b12d
SL
824 false
825 };
826 if ascii_only {
9cc50fc6
SL
827 self.err_span_(start,
828 self.last_pos,
829 "unicode escape sequences cannot be used as a \
830 byte or in a byte string");
62682a34 831 }
c1a9b12d
SL
832 valid
833
1a4d82fc
JJ
834 }
835 '\n' if delim == '"' => {
836 self.consume_whitespace();
837 true
9cc50fc6 838 }
1a4d82fc
JJ
839 '\r' if delim == '"' && self.curr_is('\n') => {
840 self.consume_whitespace();
841 true
842 }
843 c => {
844 let last_pos = self.last_pos;
9cc50fc6
SL
845 let mut err = self.struct_err_span_char(escaped_pos,
846 last_pos,
847 if ascii_only {
848 "unknown byte escape"
849 } else {
850 "unknown character \
851 escape"
852 },
853 c);
1a4d82fc 854 if e == '\r' {
9cc50fc6
SL
855 err.span_help(codemap::mk_sp(escaped_pos, last_pos),
856 "this is an isolated carriage return; consider \
857 checking your editor and version control \
858 settings");
1a4d82fc 859 }
9346a6ac 860 if (e == '{' || e == '}') && !ascii_only {
9cc50fc6
SL
861 err.span_help(codemap::mk_sp(escaped_pos, last_pos),
862 "if used in a formatting string, curly braces \
863 are escaped with `{{` and `}}`");
9346a6ac 864 }
9cc50fc6 865 err.emit();
1a4d82fc
JJ
866 false
867 }
868 }
869 }
870 }
871 }
872 '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
873 let last_pos = self.last_pos;
9cc50fc6
SL
874 self.err_span_char(start,
875 last_pos,
876 if ascii_only {
877 "byte constant must be escaped"
878 } else {
879 "character constant must be escaped"
880 },
881 first_source_char);
1a4d82fc
JJ
882 return false;
883 }
884 '\r' => {
885 if self.curr_is('\n') {
886 self.bump();
887 return true;
888 } else {
9cc50fc6
SL
889 self.err_span_(start,
890 self.last_pos,
1a4d82fc
JJ
891 "bare CR not allowed in string, use \\r instead");
892 return false;
893 }
894 }
9cc50fc6
SL
895 _ => {
896 if ascii_only && first_source_char > '\x7F' {
897 let last_pos = self.last_pos;
898 self.err_span_char(start,
899 last_pos,
900 "byte constant must be ASCII. Use a \\xHH escape for a \
901 non-ASCII byte",
902 first_source_char);
903 return false;
904 }
1a4d82fc
JJ
905 }
906 }
907 true
908 }
909
910 /// Scan over a \u{...} escape
911 ///
912 /// At this point, we have already seen the \ and the u, the { is the current character. We
913 /// will read at least one digit, and up to 6, and pass over the }.
914 fn scan_unicode_escape(&mut self, delim: char) -> bool {
915 self.bump(); // past the {
916 let start_bpos = self.last_pos;
85aaf69f 917 let mut count = 0;
1a4d82fc 918 let mut accum_int = 0;
c34b1796 919 let mut valid = true;
1a4d82fc
JJ
920
921 while !self.curr_is('}') && count <= 6 {
922 let c = match self.curr {
923 Some(c) => c,
924 None => {
9cc50fc6
SL
925 panic!(self.fatal_span_(start_bpos,
926 self.last_pos,
92a42be0 927 "unterminated unicode escape (found EOF)"));
1a4d82fc
JJ
928 }
929 };
930 accum_int *= 16;
931 accum_int += c.to_digit(16).unwrap_or_else(|| {
932 if c == delim {
9cc50fc6
SL
933 panic!(self.fatal_span_(self.last_pos,
934 self.pos,
92a42be0 935 "unterminated unicode escape (needed a `}`)"));
1a4d82fc 936 } else {
9cc50fc6
SL
937 self.err_span_char(self.last_pos,
938 self.pos,
939 "invalid character in unicode escape",
940 c);
1a4d82fc 941 }
c34b1796
AL
942 valid = false;
943 0
944 });
1a4d82fc
JJ
945 self.bump();
946 count += 1;
947 }
948
949 if count > 6 {
9cc50fc6
SL
950 self.err_span_(start_bpos,
951 self.last_pos,
952 "overlong unicode escape (can have at most 6 hex digits)");
c34b1796 953 valid = false;
1a4d82fc
JJ
954 }
955
c34b1796 956 if valid && (char::from_u32(accum_int).is_none() || count == 0) {
9cc50fc6
SL
957 self.err_span_(start_bpos,
958 self.last_pos,
959 "invalid unicode character escape");
62682a34 960 valid = false;
1a4d82fc
JJ
961 }
962
c1a9b12d 963 self.bump(); // past the ending }
1a4d82fc
JJ
964 valid
965 }
966
967 /// Scan over a float exponent.
968 fn scan_float_exponent(&mut self) {
969 if self.curr_is('e') || self.curr_is('E') {
970 self.bump();
971 if self.curr_is('-') || self.curr_is('+') {
972 self.bump();
973 }
c34b1796 974 if self.scan_digits(10, 10) == 0 {
9cc50fc6
SL
975 self.err_span_(self.last_pos,
976 self.pos,
977 "expected at least one digit in exponent")
1a4d82fc
JJ
978 }
979 }
980 }
981
982 /// Check that a base is valid for a floating literal, emitting a nice
983 /// error if it isn't.
85aaf69f 984 fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) {
1a4d82fc 985 match base {
9cc50fc6
SL
986 16 => {
987 self.err_span_(start_bpos,
988 last_bpos,
989 "hexadecimal float literal is not supported")
990 }
991 8 => {
992 self.err_span_(start_bpos,
993 last_bpos,
994 "octal float literal is not supported")
995 }
996 2 => {
997 self.err_span_(start_bpos,
998 last_bpos,
999 "binary float literal is not supported")
1000 }
1001 _ => (),
1a4d82fc
JJ
1002 }
1003 }
1004
1005 fn binop(&mut self, op: token::BinOpToken) -> token::Token {
1006 self.bump();
1007 if self.curr_is('=') {
1008 self.bump();
1009 return token::BinOpEq(op);
1010 } else {
1011 return token::BinOp(op);
1012 }
1013 }
1014
1015 /// Return the next token from the string, advances the input past that
1016 /// token, and updates the interner
1017 fn next_token_inner(&mut self) -> token::Token {
1018 let c = self.curr;
9cc50fc6
SL
1019 if ident_start(c) &&
1020 match (c.unwrap(), self.nextch(), self.nextnextch()) {
1a4d82fc
JJ
1021 // Note: r as in r" or r#" is part of a raw string literal,
1022 // b as in b' is part of a byte literal.
1023 // They are not identifiers, and are handled further down.
9cc50fc6
SL
1024 ('r', Some('"'), _) |
1025 ('r', Some('#'), _) |
1026 ('b', Some('"'), _) |
1027 ('b', Some('\''), _) |
1028 ('b', Some('r'), Some('"')) |
1029 ('b', Some('r'), Some('#')) => false,
1030 _ => true,
1a4d82fc
JJ
1031 } {
1032 let start = self.last_pos;
1033 while ident_continue(self.curr) {
1034 self.bump();
1035 }
1036
1037 return self.with_str_from(start, |string| {
1038 if string == "_" {
1039 token::Underscore
1040 } else {
1041 // FIXME: perform NFKC normalization here. (Issue #2253)
1042 if self.curr_is(':') && self.nextch_is(':') {
1043 token::Ident(str_to_ident(string), token::ModName)
1044 } else {
1045 token::Ident(str_to_ident(string), token::Plain)
1046 }
1047 }
1048 });
1049 }
1050
1051 if is_dec_digit(c) {
1052 let num = self.scan_number(c.unwrap());
1053 let suffix = self.scan_optional_raw_name();
1054 debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix);
9cc50fc6 1055 return token::Literal(num, suffix);
1a4d82fc
JJ
1056 }
1057
1a4d82fc 1058 match c.expect("next_token_inner called at EOF") {
9cc50fc6
SL
1059 // One-byte tokens.
1060 ';' => {
1a4d82fc 1061 self.bump();
9cc50fc6
SL
1062 return token::Semi;
1063 }
1064 ',' => {
1065 self.bump();
1066 return token::Comma;
1067 }
1068 '.' => {
1069 self.bump();
1070 return if self.curr_is('.') {
1071 self.bump();
1072 if self.curr_is('.') {
1073 self.bump();
1074 token::DotDotDot
1075 } else {
1076 token::DotDot
1077 }
1078 } else {
1079 token::Dot
1080 };
1081 }
1082 '(' => {
1083 self.bump();
1084 return token::OpenDelim(token::Paren);
1085 }
1086 ')' => {
1087 self.bump();
1088 return token::CloseDelim(token::Paren);
1089 }
1090 '{' => {
1091 self.bump();
1092 return token::OpenDelim(token::Brace);
1093 }
1094 '}' => {
1095 self.bump();
1096 return token::CloseDelim(token::Brace);
1097 }
1098 '[' => {
1099 self.bump();
1100 return token::OpenDelim(token::Bracket);
1101 }
1102 ']' => {
1103 self.bump();
1104 return token::CloseDelim(token::Bracket);
1105 }
1106 '@' => {
1107 self.bump();
1108 return token::At;
1109 }
1110 '#' => {
1111 self.bump();
1112 return token::Pound;
1113 }
1114 '~' => {
1115 self.bump();
1116 return token::Tilde;
1117 }
1118 '?' => {
1119 self.bump();
1120 return token::Question;
1121 }
1122 ':' => {
1123 self.bump();
1124 if self.curr_is(':') {
1125 self.bump();
1126 return token::ModSep;
1127 } else {
1128 return token::Colon;
1129 }
1a4d82fc 1130 }
1a4d82fc 1131
9cc50fc6
SL
1132 '$' => {
1133 self.bump();
1134 return token::Dollar;
1135 }
1a4d82fc 1136
9cc50fc6
SL
1137 // Multi-byte tokens.
1138 '=' => {
1a4d82fc 1139 self.bump();
9cc50fc6
SL
1140 if self.curr_is('=') {
1141 self.bump();
1142 return token::EqEq;
1143 } else if self.curr_is('>') {
1144 self.bump();
1145 return token::FatArrow;
1146 } else {
1147 return token::Eq;
1148 }
1149 }
1150 '!' => {
1a4d82fc 1151 self.bump();
9cc50fc6
SL
1152 if self.curr_is('=') {
1153 self.bump();
1154 return token::Ne;
1155 } else {
1156 return token::Not;
1157 }
1a4d82fc 1158 }
9cc50fc6 1159 '<' => {
1a4d82fc 1160 self.bump();
9cc50fc6
SL
1161 match self.curr.unwrap_or('\x00') {
1162 '=' => {
1163 self.bump();
1164 return token::Le;
1165 }
1166 '<' => {
1167 return self.binop(token::Shl);
1168 }
1169 '-' => {
1170 self.bump();
1171 match self.curr.unwrap_or('\x00') {
1172 _ => {
1173 return token::LArrow;
1174 }
1175 }
1176 }
1177 _ => {
1178 return token::Lt;
1179 }
1180 }
1181 }
1182 '>' => {
1a4d82fc
JJ
1183 self.bump();
1184 match self.curr.unwrap_or('\x00') {
9cc50fc6
SL
1185 '=' => {
1186 self.bump();
1187 return token::Ge;
1188 }
1189 '>' => {
1190 return self.binop(token::Shr);
1191 }
1192 _ => {
1193 return token::Gt;
1194 }
1a4d82fc 1195 }
1a4d82fc 1196 }
9cc50fc6
SL
1197 '\'' => {
1198 // Either a character constant 'a' OR a lifetime name 'abc
1199 let start_with_quote = self.last_pos;
1200 self.bump();
1201 let start = self.last_pos;
1a4d82fc 1202
9cc50fc6
SL
1203 // the eof will be picked up by the final `'` check below
1204 let c2 = self.curr.unwrap_or('\x00');
1205 self.bump();
1a4d82fc 1206
9cc50fc6
SL
1207 // If the character is an ident start not followed by another single
1208 // quote, then this is a lifetime name:
1209 if ident_start(Some(c2)) && !self.curr_is('\'') {
1210 while ident_continue(self.curr) {
1211 self.bump();
1212 }
1213 // lifetimes shouldn't end with a single quote
1214 // if we find one, then this is an invalid character literal
1215 if self.curr_is('\'') {
1216 panic!(self.fatal_span_verbose(
1217 start_with_quote, self.pos,
1218 String::from("character literal may only contain one codepoint")));
1a4d82fc 1219
9cc50fc6 1220 }
1a4d82fc 1221
9cc50fc6
SL
1222 // Include the leading `'` in the real identifier, for macro
1223 // expansion purposes. See #12512 for the gory details of why
1224 // this is necessary.
1225 let ident = self.with_str_from(start, |lifetime_name| {
1226 str_to_ident(&format!("'{}", lifetime_name))
1227 });
1228
1229 // Conjure up a "keyword checking ident" to make sure that
1230 // the lifetime name is not a keyword.
1231 let keyword_checking_ident = self.with_str_from(start, |lifetime_name| {
1a4d82fc
JJ
1232 str_to_ident(lifetime_name)
1233 });
9cc50fc6
SL
1234 let keyword_checking_token = &token::Ident(keyword_checking_ident,
1235 token::Plain);
1236 let last_bpos = self.last_pos;
1237 if keyword_checking_token.is_keyword(token::keywords::SelfValue) {
1238 self.err_span_(start,
1239 last_bpos,
1240 "invalid lifetime name: 'self is no longer a special \
1241 lifetime");
1242 } else if keyword_checking_token.is_any_keyword() &&
1243 !keyword_checking_token.is_keyword(token::keywords::Static) {
1244 self.err_span_(start, last_bpos, "invalid lifetime name");
1245 }
1246
1247 return token::Lifetime(ident);
1a4d82fc 1248 }
1a4d82fc 1249
9cc50fc6
SL
1250 let valid = self.scan_char_or_byte(start,
1251 c2,
1252 // ascii_only =
1253 false,
1254 '\'');
92a42be0 1255
9cc50fc6
SL
1256 if !self.curr_is('\'') {
1257 panic!(self.fatal_span_verbose(
1258 start_with_quote, self.last_pos,
1259 String::from("character literal may only contain one codepoint")));
1a4d82fc
JJ
1260 }
1261
9cc50fc6
SL
1262 let id = if valid {
1263 self.name_from(start)
1264 } else {
1265 token::intern("0")
1266 };
1267 self.bump(); // advance curr past token
1268 let suffix = self.scan_optional_raw_name();
1269 return token::Literal(token::Char(id), suffix);
1a4d82fc 1270 }
9cc50fc6 1271 'b' => {
1a4d82fc 1272 self.bump();
9cc50fc6
SL
1273 let lit = match self.curr {
1274 Some('\'') => self.scan_byte(),
1275 Some('"') => self.scan_byte_string(),
1276 Some('r') => self.scan_raw_byte_string(),
1277 _ => unreachable!(), // Should have been a token::Ident above.
1278 };
1279 let suffix = self.scan_optional_raw_name();
1280 return token::Literal(lit, suffix);
1a4d82fc 1281 }
9cc50fc6
SL
1282 '"' => {
1283 let start_bpos = self.last_pos;
1284 let mut valid = true;
1285 self.bump();
1286 while !self.curr_is('"') {
1287 if self.is_eof() {
1288 let last_bpos = self.last_pos;
1289 panic!(self.fatal_span_(start_bpos,
1290 last_bpos,
1291 "unterminated double quote string"));
1292 }
1a4d82fc 1293
9cc50fc6
SL
1294 let ch_start = self.last_pos;
1295 let ch = self.curr.unwrap();
1296 self.bump();
1297 valid &= self.scan_char_or_byte(ch_start,
1298 ch,
1299 // ascii_only =
1300 false,
1301 '"');
1302 }
1303 // adjust for the ASCII " at the start of the literal
1304 let id = if valid {
1305 self.name_from(start_bpos + BytePos(1))
1306 } else {
1307 token::intern("??")
1308 };
1309 self.bump();
1310 let suffix = self.scan_optional_raw_name();
1311 return token::Literal(token::Str_(id), suffix);
1a4d82fc 1312 }
9cc50fc6
SL
1313 'r' => {
1314 let start_bpos = self.last_pos;
1315 self.bump();
1316 let mut hash_count = 0;
1317 while self.curr_is('#') {
1318 self.bump();
1319 hash_count += 1;
1320 }
1321
1a4d82fc
JJ
1322 if self.is_eof() {
1323 let last_bpos = self.last_pos;
92a42be0 1324 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
9cc50fc6
SL
1325 } else if !self.curr_is('"') {
1326 let last_bpos = self.last_pos;
1327 let curr_char = self.curr.unwrap();
1328 panic!(self.fatal_span_char(start_bpos,
1329 last_bpos,
1330 "found invalid character; only `#` is allowed \
1331 in raw string delimitation",
1332 curr_char));
1a4d82fc 1333 }
9cc50fc6
SL
1334 self.bump();
1335 let content_start_bpos = self.last_pos;
1336 let mut content_end_bpos;
1337 let mut valid = true;
1338 'outer: loop {
1339 if self.is_eof() {
1340 let last_bpos = self.last_pos;
1341 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1342 }
1343 // if self.curr_is('"') {
1344 // content_end_bpos = self.last_pos;
1345 // for _ in 0..hash_count {
1346 // self.bump();
1347 // if !self.curr_is('#') {
1348 // continue 'outer;
1349 let c = self.curr.unwrap();
1350 match c {
1351 '"' => {
1352 content_end_bpos = self.last_pos;
1353 for _ in 0..hash_count {
1354 self.bump();
1355 if !self.curr_is('#') {
1356 continue 'outer;
1357 }
1a4d82fc 1358 }
9cc50fc6 1359 break;
1a4d82fc 1360 }
9cc50fc6
SL
1361 '\r' => {
1362 if !self.nextch_is('\n') {
1363 let last_bpos = self.last_pos;
1364 self.err_span_(start_bpos,
1365 last_bpos,
1366 "bare CR not allowed in raw string, use \\r \
1367 instead");
1368 valid = false;
1369 }
1a4d82fc 1370 }
9cc50fc6 1371 _ => (),
1a4d82fc 1372 }
9cc50fc6 1373 self.bump();
1a4d82fc
JJ
1374 }
1375 self.bump();
9cc50fc6
SL
1376 let id = if valid {
1377 self.name_from_to(content_start_bpos, content_end_bpos)
1378 } else {
1379 token::intern("??")
1380 };
1381 let suffix = self.scan_optional_raw_name();
1382 return token::Literal(token::StrRaw(id, hash_count), suffix);
1383 }
1384 '-' => {
1385 if self.nextch_is('>') {
1386 self.bump();
1387 self.bump();
1388 return token::RArrow;
1389 } else {
1390 return self.binop(token::Minus);
1391 }
1392 }
1393 '&' => {
1394 if self.nextch_is('&') {
1395 self.bump();
1396 self.bump();
1397 return token::AndAnd;
1398 } else {
1399 return self.binop(token::And);
1400 }
1401 }
1402 '|' => {
1403 match self.nextch() {
1404 Some('|') => {
1405 self.bump();
1406 self.bump();
1407 return token::OrOr;
1408 }
1409 _ => {
1410 return self.binop(token::Or);
1411 }
1412 }
1413 }
1414 '+' => {
1415 return self.binop(token::Plus);
1416 }
1417 '*' => {
1418 return self.binop(token::Star);
1419 }
1420 '/' => {
1421 return self.binop(token::Slash);
1422 }
1423 '^' => {
1424 return self.binop(token::Caret);
1425 }
1426 '%' => {
1427 return self.binop(token::Percent);
1428 }
1429 c => {
1430 let last_bpos = self.last_pos;
1431 let bpos = self.pos;
1432 let mut err = self.struct_fatal_span_char(last_bpos,
1433 bpos,
1434 "unknown start of token",
1435 c);
1436 unicode_chars::check_for_substitution(&self, c, &mut err);
1437 err.emit();
1438 panic!(FatalError);
1a4d82fc 1439 }
1a4d82fc
JJ
1440 }
1441 }
1442
1443 fn consume_whitespace(&mut self) {
54a0048b 1444 while is_pattern_whitespace(self.curr) && !self.is_eof() {
9cc50fc6
SL
1445 self.bump();
1446 }
1a4d82fc
JJ
1447 }
1448
1449 fn read_to_eol(&mut self) -> String {
1450 let mut val = String::new();
1451 while !self.curr_is('\n') && !self.is_eof() {
1452 val.push(self.curr.unwrap());
1453 self.bump();
1454 }
9cc50fc6
SL
1455 if self.curr_is('\n') {
1456 self.bump();
1457 }
1458 return val;
1a4d82fc
JJ
1459 }
1460
1461 fn read_one_line_comment(&mut self) -> String {
1462 let val = self.read_to_eol();
9cc50fc6
SL
1463 assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') ||
1464 (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!'));
1a4d82fc
JJ
1465 return val;
1466 }
1467
1468 fn consume_non_eol_whitespace(&mut self) {
54a0048b 1469 while is_pattern_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
1a4d82fc
JJ
1470 self.bump();
1471 }
1472 }
1473
1474 fn peeking_at_comment(&self) -> bool {
9cc50fc6
SL
1475 (self.curr_is('/') && self.nextch_is('/')) || (self.curr_is('/') && self.nextch_is('*')) ||
1476 // consider shebangs comments, but not inner attributes
1477 (self.curr_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
1a4d82fc
JJ
1478 }
1479
1480 fn scan_byte(&mut self) -> token::Lit {
1481 self.bump();
1482 let start = self.last_pos;
1483
1484 // the eof will be picked up by the final `'` check below
1485 let c2 = self.curr.unwrap_or('\x00');
1486 self.bump();
1487
9cc50fc6
SL
1488 let valid = self.scan_char_or_byte(start,
1489 c2,
1490 // ascii_only =
1491 true,
1492 '\'');
1a4d82fc
JJ
1493 if !self.curr_is('\'') {
1494 // Byte offsetting here is okay because the
1495 // character before position `start` are an
1496 // ascii single quote and ascii 'b'.
1497 let last_pos = self.last_pos;
9cc50fc6
SL
1498 panic!(self.fatal_span_verbose(start - BytePos(2),
1499 last_pos,
1500 "unterminated byte constant".to_string()));
1a4d82fc
JJ
1501 }
1502
9cc50fc6
SL
1503 let id = if valid {
1504 self.name_from(start)
1505 } else {
1506 token::intern("?")
1507 };
1a4d82fc
JJ
1508 self.bump(); // advance curr past token
1509 return token::Byte(id);
1510 }
1511
1512 fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
1513 self.scan_hex_digits(2, delim, below_0x7f_only)
1514 }
1515
1516 fn scan_byte_string(&mut self) -> token::Lit {
1517 self.bump();
1518 let start = self.last_pos;
1519 let mut valid = true;
1520
1521 while !self.curr_is('"') {
1522 if self.is_eof() {
1523 let last_pos = self.last_pos;
92a42be0 1524 panic!(self.fatal_span_(start, last_pos, "unterminated double quote byte string"));
1a4d82fc
JJ
1525 }
1526
1527 let ch_start = self.last_pos;
1528 let ch = self.curr.unwrap();
1529 self.bump();
9cc50fc6
SL
1530 valid &= self.scan_char_or_byte(ch_start,
1531 ch,
1532 // ascii_only =
1533 true,
1534 '"');
1a4d82fc 1535 }
9cc50fc6
SL
1536 let id = if valid {
1537 self.name_from(start)
1538 } else {
1539 token::intern("??")
1540 };
1a4d82fc 1541 self.bump();
e9174d1e 1542 return token::ByteStr(id);
1a4d82fc
JJ
1543 }
1544
1545 fn scan_raw_byte_string(&mut self) -> token::Lit {
1546 let start_bpos = self.last_pos;
1547 self.bump();
85aaf69f 1548 let mut hash_count = 0;
1a4d82fc
JJ
1549 while self.curr_is('#') {
1550 self.bump();
1551 hash_count += 1;
1552 }
1553
1554 if self.is_eof() {
1555 let last_pos = self.last_pos;
92a42be0 1556 panic!(self.fatal_span_(start_bpos, last_pos, "unterminated raw string"));
1a4d82fc
JJ
1557 } else if !self.curr_is('"') {
1558 let last_pos = self.last_pos;
1559 let ch = self.curr.unwrap();
9cc50fc6
SL
1560 panic!(self.fatal_span_char(start_bpos,
1561 last_pos,
1562 "found invalid character; only `#` is allowed in raw \
1563 string delimitation",
1564 ch));
1a4d82fc
JJ
1565 }
1566 self.bump();
1567 let content_start_bpos = self.last_pos;
1568 let mut content_end_bpos;
1569 'outer: loop {
1570 match self.curr {
1571 None => {
1572 let last_pos = self.last_pos;
92a42be0 1573 panic!(self.fatal_span_(start_bpos, last_pos, "unterminated raw string"))
9cc50fc6 1574 }
1a4d82fc
JJ
1575 Some('"') => {
1576 content_end_bpos = self.last_pos;
85aaf69f 1577 for _ in 0..hash_count {
1a4d82fc
JJ
1578 self.bump();
1579 if !self.curr_is('#') {
1580 continue 'outer;
1581 }
1582 }
1583 break;
9cc50fc6
SL
1584 }
1585 Some(c) => {
1586 if c > '\x7F' {
1587 let last_pos = self.last_pos;
1588 self.err_span_char(last_pos, last_pos, "raw byte string must be ASCII", c);
1589 }
1a4d82fc
JJ
1590 }
1591 }
1592 self.bump();
1593 }
1594 self.bump();
9cc50fc6
SL
1595 return token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos),
1596 hash_count);
1a4d82fc
JJ
1597 }
1598}
1599
54a0048b
SL
1600// This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1601// is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1602pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1603 c.map_or(false, Pattern_White_Space)
1a4d82fc
JJ
1604}
1605
// True when `c` holds a character within the inclusive range `lo..hi`;
// `None` is never in range.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}
1612
9cc50fc6
SL
1613fn is_dec_digit(c: Option<char>) -> bool {
1614 return in_range(c, '0', '9');
1615}
1a4d82fc
JJ
1616
1617pub fn is_doc_comment(s: &str) -> bool {
9cc50fc6
SL
1618 let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
1619 s.starts_with("//!");
1a4d82fc
JJ
1620 debug!("is {:?} a doc comment? {}", s, res);
1621 res
1622}
1623
1624pub fn is_block_doc_comment(s: &str) -> bool {
9cc50fc6
SL
1625 // Prevent `/**/` from being parsed as a doc comment
1626 let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
1627 s.starts_with("/*!")) && s.len() >= 5;
1a4d82fc
JJ
1628 debug!("is {:?} a doc comment? {}", s, res);
1629 res
1630}
1631
1632fn ident_start(c: Option<char>) -> bool {
9cc50fc6
SL
1633 let c = match c {
1634 Some(c) => c,
1635 None => return false,
1636 };
1a4d82fc 1637
9cc50fc6 1638 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start())
1a4d82fc
JJ
1639}
1640
1641fn ident_continue(c: Option<char>) -> bool {
9cc50fc6
SL
1642 let c = match c {
1643 Some(c) => c,
1644 None => return false,
1645 };
1a4d82fc 1646
9cc50fc6
SL
1647 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' ||
1648 (c > '\x7f' && c.is_xid_continue())
1a4d82fc
JJ
1649}
1650
#[cfg(test)]
mod tests {
    use super::*;

    use codemap::{BytePos, CodeMap, Span, NO_EXPANSION};
    use errors;
    use parse::token;
    use parse::token::str_to_ident;
    use std::io;
    use std::rc::Rc;

    // build an error handler whose emitter writes into `io::sink()`
    fn mk_sh(cm: Rc<CodeMap>) -> errors::Handler {
        // FIXME (#22405): Replace `Box::new` with `box` here when/if possible.
        let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), None, cm);
        errors::Handler::with_emitter(true, false, Box::new(emitter))
    }

    // open a string reader for the given string
    fn setup<'a>(cm: &CodeMap,
                 span_handler: &'a errors::Handler,
                 teststr: String)
                 -> StringReader<'a> {
        let fm = cm.new_filemap("zebra.rs".to_string(), teststr);
        StringReader::new(span_handler, fm)
    }

    // lex `src` with a fresh codemap and handler, returning only the first token
    fn first_token(src: &str) -> token::Token {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let tok = setup(&cm, &sh, src.to_string()).next_token().tok;
        tok
    }

    // lex `src` with a fresh codemap and handler and compare the token stream
    fn check_source(src: &str, expected: Vec<token::Token>) {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        check_tokenization(setup(&cm, &sh, src.to_string()), expected);
    }

    #[test]
    fn t1() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let source = "/* my source file */ fn main() { println!(\"zebra\"); }\n".to_string();
        let mut string_reader = setup(&cm, &sh, source);
        let id = str_to_ident("fn");
        assert_eq!(string_reader.next_token().tok, token::Comment);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);

        let fn_actual = string_reader.next_token();
        let fn_expected = TokenAndSpan {
            tok: token::Ident(id, token::Plain),
            sp: Span {
                lo: BytePos(21),
                hi: BytePos(23),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(fn_actual, fn_expected);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);
        // the 'main' id is already read:
        assert_eq!(string_reader.last_pos.clone(), BytePos(28));

        // read another token:
        let main_actual = string_reader.next_token();
        let main_expected = TokenAndSpan {
            tok: token::Ident(str_to_ident("main"), token::Plain),
            sp: Span {
                lo: BytePos(24),
                hi: BytePos(28),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(main_actual, main_expected);
        // the lparen is already read:
        assert_eq!(string_reader.last_pos.clone(), BytePos(29))
    }

    // check that the given reader produces the desired stream
    // of tokens (stop checking after exhausting the expected vec)
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for wanted in expected.iter() {
            let got = string_reader.next_token().tok;
            assert_eq!(&got, wanted);
        }
    }

    // make the identifier by looking up the string in the interner
    fn mk_ident(id: &str, style: token::IdentStyle) -> token::Token {
        token::Ident(str_to_ident(id), style)
    }

    #[test]
    fn doublecolonparsing() {
        check_source("a b",
                     vec![mk_ident("a", token::Plain),
                          token::Whitespace,
                          mk_ident("b", token::Plain)]);
    }

    #[test]
    fn dcparsing_2() {
        check_source("a::b",
                     vec![mk_ident("a", token::ModName),
                          token::ModSep,
                          mk_ident("b", token::Plain)]);
    }

    #[test]
    fn dcparsing_3() {
        check_source("a ::b",
                     vec![mk_ident("a", token::Plain),
                          token::Whitespace,
                          token::ModSep,
                          mk_ident("b", token::Plain)]);
    }

    #[test]
    fn dcparsing_4() {
        check_source("a:: b",
                     vec![mk_ident("a", token::ModName),
                          token::ModSep,
                          token::Whitespace,
                          mk_ident("b", token::Plain)]);
    }

    #[test]
    fn character_a() {
        assert_eq!(first_token("'a'"),
                   token::Literal(token::Char(token::intern("a")), None));
    }

    #[test]
    fn character_space() {
        assert_eq!(first_token("' '"),
                   token::Literal(token::Char(token::intern(" ")), None));
    }

    #[test]
    fn character_escaped() {
        assert_eq!(first_token("'\\n'"),
                   token::Literal(token::Char(token::intern("\\n")), None));
    }

    #[test]
    fn lifetime_name() {
        assert_eq!(first_token("'abc"),
                   token::Lifetime(token::str_to_ident("'abc")));
    }

    #[test]
    fn raw_string() {
        assert_eq!(first_token("r###\"\"#a\\b\x00c\"\"###"),
                   token::Literal(token::StrRaw(token::intern("\"#a\\b\x00c\""), 3), None));
    }

    #[test]
    fn literal_suffixes() {
        macro_rules! test {
            ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
                // suffix glued directly onto the literal:
                assert_eq!(first_token(&format!("{}suffix", $input)),
                           token::Literal(token::$tok_type(token::intern($tok_contents)),
                                          Some(token::intern("suffix"))));
                // with a whitespace separator:
                assert_eq!(first_token(&format!("{} suffix", $input)),
                           token::Literal(token::$tok_type(token::intern($tok_contents)),
                                          None));
            }}
        }

        test!("'a'", Char, "a");
        test!("b'a'", Byte, "a");
        test!("\"a\"", Str_, "a");
        test!("b\"a\"", ByteStr, "a");
        test!("1234", Integer, "1234");
        test!("0b101", Integer, "0b101");
        test!("0xABC", Integer, "0xABC");
        test!("1.0", Float, "1.0");
        test!("1.0e10", Float, "1.0e10");

        assert_eq!(first_token("2us"),
                   token::Literal(token::Integer(token::intern("2")),
                                  Some(token::intern("us"))));
        assert_eq!(first_token("r###\"raw\"###suffix"),
                   token::Literal(token::StrRaw(token::intern("raw"), 3),
                                  Some(token::intern("suffix"))));
        assert_eq!(first_token("br###\"raw\"###suffix"),
                   token::Literal(token::ByteStrRaw(token::intern("raw"), 3),
                                  Some(token::intern("suffix"))));
    }

    #[test]
    fn line_doc_comments() {
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        assert!(!is_doc_comment("////"));
    }

    #[test]
    fn nested_block_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
        // the whole nested comment must come back as a single token
        if let token::Comment = lexer.next_token().tok {
        } else {
            panic!("expected a comment!");
        }
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(token::intern("a")), None));
    }

    #[test]
    fn crlf_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());
        let comment = lexer.next_token();
        assert_eq!(comment.tok, token::Comment);
        // the span stops before the `\r\n` line ending
        assert_eq!(comment.sp, ::codemap::mk_sp(BytePos(0), BytePos(7)));
        assert_eq!(lexer.next_token().tok, token::Whitespace);
        assert_eq!(lexer.next_token().tok,
                   token::DocComment(token::intern("/// test")));
    }
}