]> git.proxmox.com Git - rustc.git/blob - src/libsyntax/parse/lexer/mod.rs
New upstream version 1.16.0+dfsg1
[rustc.git] / src / libsyntax / parse / lexer / mod.rs
1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use ast::{self, Ident};
12 use syntax_pos::{self, BytePos, CharPos, Pos, Span};
13 use codemap::CodeMap;
14 use errors::{FatalError, DiagnosticBuilder};
15 use parse::{token, ParseSess};
16 use str::char_at;
17 use symbol::{Symbol, keywords};
18 use std_unicode::property::Pattern_White_Space;
19
20 use std::borrow::Cow;
21 use std::char;
22 use std::mem::replace;
23 use std::rc::Rc;
24
25 pub mod comments;
26 mod tokentrees;
27 mod unicode_chars;
28
/// A token paired with the source span it was lexed from.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct TokenAndSpan {
    pub tok: token::Token,
    pub sp: Span,
}
34
35 impl Default for TokenAndSpan {
36 fn default() -> Self {
37 TokenAndSpan { tok: token::Underscore, sp: syntax_pos::DUMMY_SP }
38 }
39 }
40
/// Lexer over a single `FileMap`: reads characters from the cached source
/// text and produces tokens, keeping one token of lookahead in
/// `peek_tok`/`peek_span`.
pub struct StringReader<'a> {
    pub sess: &'a ParseSess,
    /// The absolute offset within the codemap of the next character to read
    pub next_pos: BytePos,
    /// The absolute offset within the codemap of the current character
    pub pos: BytePos,
    /// The column of the next character to read
    pub col: CharPos,
    /// The current character (which has been read from self.pos)
    pub ch: Option<char>,
    pub filemap: Rc<syntax_pos::FileMap>,
    /// If Some, stop reading the source at this position (inclusive).
    pub terminator: Option<BytePos>,
    /// Whether to record new-lines in filemap. This is only necessary the first
    /// time a filemap is lexed. If part of a filemap is being re-lexed, this
    /// should be set to false.
    pub save_new_lines: bool,
    // cached: the one token of lookahead handed out by `try_next_token`.
    pub peek_tok: token::Token,
    pub peek_span: Span,
    pub fatal_errs: Vec<DiagnosticBuilder<'a>>,
    // cache a direct reference to the source text, so that we don't have to
    // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time.
    source_text: Rc<String>,
    // Most recent "real" (non-whitespace/comment) token and its span,
    // recorded by `try_real_token`.
    token: token::Token,
    span: Span,
    /// Stack of open delimiters and their spans. Used for error message.
    open_braces: Vec<(token::DelimToken, Span)>,
}
70
impl<'a> StringReader<'a> {
    /// Returns the next token, aborting the session on a lexical fatal
    /// error (see `unwrap_or_abort`).
    fn next_token(&mut self) -> TokenAndSpan where Self: Sized {
        let res = self.try_next_token();
        self.unwrap_or_abort(res)
    }
    /// Unwraps a lexing result; on `Err`, emits any buffered fatal
    /// diagnostics and panics with `FatalError` to abort the parse.
    fn unwrap_or_abort(&mut self, res: Result<TokenAndSpan, ()>) -> TokenAndSpan {
        match res {
            Ok(tok) => tok,
            Err(_) => {
                self.emit_fatal_errors();
                panic!(FatalError);
            }
        }
    }
    /// Like `try_next_token`, but skips whitespace, comment, and shebang
    /// tokens, then records the result in `self.token`/`self.span`.
    fn try_real_token(&mut self) -> Result<TokenAndSpan, ()> {
        let mut t = self.try_next_token()?;
        loop {
            match t.tok {
                token::Whitespace | token::Comment | token::Shebang(_) => {
                    t = self.try_next_token()?;
                }
                _ => break,
            }
        }
        self.token = t.tok.clone();
        self.span = t.sp;
        Ok(t)
    }
    /// `try_real_token`, aborting on lexical fatal errors.
    pub fn real_token(&mut self) -> TokenAndSpan {
        let res = self.try_real_token();
        self.unwrap_or_abort(res)
    }
    /// True at end of input: either no current character, or we have
    /// passed the optional `terminator` position.
    fn is_eof(&self) -> bool {
        if self.ch.is_none() {
            return true;
        }

        match self.terminator {
            Some(t) => self.next_pos > t,
            None => false,
        }
    }
    /// Return the next token. EFFECT: advances the string_reader.
    pub fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> {
        assert!(self.fatal_errs.is_empty());
        // Hand out the cached lookahead token and refill the cache.
        // `Underscore` is a throwaway placeholder, never observed.
        let ret_val = TokenAndSpan {
            tok: replace(&mut self.peek_tok, token::Underscore),
            sp: self.peek_span,
        };
        self.advance_token()?;
        Ok(ret_val)
    }
    /// Builds a fatal error at the span of the cached lookahead token.
    fn fatal(&self, m: &str) -> FatalError {
        self.fatal_span(self.peek_span, m)
    }
    /// Emits and clears all buffered fatal diagnostics.
    pub fn emit_fatal_errors(&mut self) {
        for err in &mut self.fatal_errs {
            err.emit();
        }
        self.fatal_errs.clear();
    }
    /// Returns (a clone of) the cached lookahead token without consuming it.
    pub fn peek(&self) -> TokenAndSpan {
        // FIXME(pcwalton): Bad copy!
        TokenAndSpan {
            tok: self.peek_tok.clone(),
            sp: self.peek_span,
        }
    }
}
140
141 impl<'a> StringReader<'a> {
142 /// For comments.rs, which hackily pokes into next_pos and ch
143 pub fn new_raw<'b>(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
144 let mut sr = StringReader::new_raw_internal(sess, filemap);
145 sr.bump();
146 sr
147 }
148
149 fn new_raw_internal(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
150 if filemap.src.is_none() {
151 sess.span_diagnostic.bug(&format!("Cannot lex filemap without source: {}",
152 filemap.name));
153 }
154
155 let source_text = (*filemap.src.as_ref().unwrap()).clone();
156
157 StringReader {
158 sess: sess,
159 next_pos: filemap.start_pos,
160 pos: filemap.start_pos,
161 col: CharPos(0),
162 ch: Some('\n'),
163 filemap: filemap,
164 terminator: None,
165 save_new_lines: true,
166 // dummy values; not read
167 peek_tok: token::Eof,
168 peek_span: syntax_pos::DUMMY_SP,
169 source_text: source_text,
170 fatal_errs: Vec::new(),
171 token: token::Eof,
172 span: syntax_pos::DUMMY_SP,
173 open_braces: Vec::new(),
174 }
175 }
176
177 pub fn new(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
178 let mut sr = StringReader::new_raw(sess, filemap);
179 if let Err(_) = sr.advance_token() {
180 sr.emit_fatal_errors();
181 panic!(FatalError);
182 }
183 sr
184 }
185
186 pub fn ch_is(&self, c: char) -> bool {
187 self.ch == Some(c)
188 }
189
190 /// Report a fatal lexical error with a given span.
191 pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
192 self.sess.span_diagnostic.span_fatal(sp, m)
193 }
194
195 /// Report a lexical error with a given span.
196 pub fn err_span(&self, sp: Span, m: &str) {
197 self.sess.span_diagnostic.span_err(sp, m)
198 }
199
200
201 /// Report a fatal error spanning [`from_pos`, `to_pos`).
202 fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
203 self.fatal_span(syntax_pos::mk_sp(from_pos, to_pos), m)
204 }
205
206 /// Report a lexical error spanning [`from_pos`, `to_pos`).
207 fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
208 self.err_span(syntax_pos::mk_sp(from_pos, to_pos), m)
209 }
210
211 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
212 /// escaped character to the error message
213 fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
214 let mut m = m.to_string();
215 m.push_str(": ");
216 for c in c.escape_default() {
217 m.push(c)
218 }
219 self.fatal_span_(from_pos, to_pos, &m[..])
220 }
221 fn struct_fatal_span_char(&self,
222 from_pos: BytePos,
223 to_pos: BytePos,
224 m: &str,
225 c: char)
226 -> DiagnosticBuilder<'a> {
227 let mut m = m.to_string();
228 m.push_str(": ");
229 for c in c.escape_default() {
230 m.push(c)
231 }
232 self.sess.span_diagnostic.struct_span_fatal(syntax_pos::mk_sp(from_pos, to_pos), &m[..])
233 }
234
235 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
236 /// escaped character to the error message
237 fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
238 let mut m = m.to_string();
239 m.push_str(": ");
240 for c in c.escape_default() {
241 m.push(c)
242 }
243 self.err_span_(from_pos, to_pos, &m[..]);
244 }
245 fn struct_err_span_char(&self,
246 from_pos: BytePos,
247 to_pos: BytePos,
248 m: &str,
249 c: char)
250 -> DiagnosticBuilder<'a> {
251 let mut m = m.to_string();
252 m.push_str(": ");
253 for c in c.escape_default() {
254 m.push(c)
255 }
256 self.sess.span_diagnostic.struct_span_err(syntax_pos::mk_sp(from_pos, to_pos), &m[..])
257 }
258
259 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
260 /// offending string to the error message
261 fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
262 m.push_str(": ");
263 let from = self.byte_offset(from_pos).to_usize();
264 let to = self.byte_offset(to_pos).to_usize();
265 m.push_str(&self.source_text[from..to]);
266 self.fatal_span_(from_pos, to_pos, &m[..])
267 }
268
269 /// Advance peek_tok and peek_span to refer to the next token, and
270 /// possibly update the interner.
271 fn advance_token(&mut self) -> Result<(), ()> {
272 match self.scan_whitespace_or_comment() {
273 Some(comment) => {
274 self.peek_span = comment.sp;
275 self.peek_tok = comment.tok;
276 }
277 None => {
278 if self.is_eof() {
279 self.peek_tok = token::Eof;
280 self.peek_span = syntax_pos::mk_sp(self.filemap.end_pos, self.filemap.end_pos);
281 } else {
282 let start_bytepos = self.pos;
283 self.peek_tok = self.next_token_inner()?;
284 self.peek_span = syntax_pos::mk_sp(start_bytepos, self.pos);
285 };
286 }
287 }
288 Ok(())
289 }
290
291 fn byte_offset(&self, pos: BytePos) -> BytePos {
292 (pos - self.filemap.start_pos)
293 }
294
295 /// Calls `f` with a string slice of the source text spanning from `start`
296 /// up to but excluding `self.pos`, meaning the slice does not include
297 /// the character `self.ch`.
298 pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T
299 where F: FnOnce(&str) -> T
300 {
301 self.with_str_from_to(start, self.pos, f)
302 }
303
304 /// Create a Name from a given offset to the current offset, each
305 /// adjusted 1 towards each other (assumes that on either side there is a
306 /// single-byte delimiter).
307 pub fn name_from(&self, start: BytePos) -> ast::Name {
308 debug!("taking an ident from {:?} to {:?}", start, self.pos);
309 self.with_str_from(start, Symbol::intern)
310 }
311
312 /// As name_from, with an explicit endpoint.
313 pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name {
314 debug!("taking an ident from {:?} to {:?}", start, end);
315 self.with_str_from_to(start, end, Symbol::intern)
316 }
317
318 /// Calls `f` with a string slice of the source text spanning from `start`
319 /// up to but excluding `end`.
320 fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T
321 where F: FnOnce(&str) -> T
322 {
323 f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()])
324 }
325
    /// Converts CRLF to LF in the given string, raising an error on bare CR.
    ///
    /// Fast path: scan for the first CRLF pair; if none is found, no
    /// allocation happens and bare CRs are reported in place. Only when a
    /// CRLF is seen do we fall into `translate_crlf_`, which builds a fresh
    /// `String` with the `\r` of each CRLF pair dropped.
    fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
        let mut i = 0;
        while i < s.len() {
            let ch = char_at(s, i);
            let next = i + ch.len_utf8();
            if ch == '\r' {
                if next < s.len() && char_at(s, next) == '\n' {
                    // First CRLF found: switch to the allocating slow path.
                    return translate_crlf_(self, start, s, errmsg, i).into();
                }
                // Bare CR (not followed by LF): report, but keep scanning.
                let pos = start + BytePos(i as u32);
                let end_pos = start + BytePos(next as u32);
                self.err_span_(pos, end_pos, errmsg);
            }
            i = next;
        }
        return s.into();

        // Slow path: copy `s` into `buf`, skipping the `\r` of each CRLF
        // pair. `i` is the byte index of the first `\r` of a CRLF pair;
        // `j` tracks the start of the next not-yet-copied chunk.
        fn translate_crlf_(rdr: &StringReader,
                           start: BytePos,
                           s: &str,
                           errmsg: &str,
                           mut i: usize)
                           -> String {
            let mut buf = String::with_capacity(s.len());
            let mut j = 0;
            while i < s.len() {
                let ch = char_at(s, i);
                let next = i + ch.len_utf8();
                if ch == '\r' {
                    if j < i {
                        buf.push_str(&s[j..i]);
                    }
                    // Skip the `\r` itself in the output.
                    j = next;
                    if next >= s.len() || char_at(s, next) != '\n' {
                        // Bare CR in the tail: report it here too.
                        let pos = start + BytePos(i as u32);
                        let end_pos = start + BytePos(next as u32);
                        rdr.err_span_(pos, end_pos, errmsg);
                    }
                }
                i = next;
            }
            if j < s.len() {
                buf.push_str(&s[j..]);
            }
            buf
        }
    }
374
375
    /// Advance the StringReader by one character. If a newline is
    /// discovered, add it to the FileMap's list of line start offsets.
    pub fn bump(&mut self) {
        let new_pos = self.next_pos;
        let new_byte_offset = self.byte_offset(new_pos).to_usize();
        if new_byte_offset < self.source_text.len() {
            // Note whether we are leaving a newline *before* overwriting `ch`;
            // the line-start bookkeeping below depends on the old character.
            let old_ch_is_newline = self.ch.unwrap() == '\n';
            let new_ch = char_at(&self.source_text, new_byte_offset);
            let new_ch_len = new_ch.len_utf8();

            self.ch = Some(new_ch);
            self.pos = new_pos;
            self.next_pos = new_pos + Pos::from_usize(new_ch_len);
            if old_ch_is_newline {
                if self.save_new_lines {
                    // Record `self.pos` as the start offset of a new line.
                    self.filemap.next_line(self.pos);
                }
                self.col = CharPos(0);
            } else {
                self.col = self.col + CharPos(1);
            }
            if new_ch_len > 1 {
                // Multibyte chars are recorded so byte<->char positions can
                // be converted later.
                self.filemap.record_multibyte_char(self.pos, new_ch_len);
            }
        } else {
            // End of input: `ch == None` is what `is_eof` checks for.
            self.ch = None;
            self.pos = new_pos;
        }
    }
405
406 pub fn nextch(&self) -> Option<char> {
407 let offset = self.byte_offset(self.next_pos).to_usize();
408 if offset < self.source_text.len() {
409 Some(char_at(&self.source_text, offset))
410 } else {
411 None
412 }
413 }
414
415 pub fn nextch_is(&self, c: char) -> bool {
416 self.nextch() == Some(c)
417 }
418
419 pub fn nextnextch(&self) -> Option<char> {
420 let offset = self.byte_offset(self.next_pos).to_usize();
421 let s = &self.source_text[..];
422 if offset >= s.len() {
423 return None;
424 }
425 let next = offset + char_at(s, offset).len_utf8();
426 if next < s.len() {
427 Some(char_at(s, next))
428 } else {
429 None
430 }
431 }
432
433 pub fn nextnextch_is(&self, c: char) -> bool {
434 self.nextnextch() == Some(c)
435 }
436
437 /// Eats <XID_start><XID_continue>*, if possible.
438 fn scan_optional_raw_name(&mut self) -> Option<ast::Name> {
439 if !ident_start(self.ch) {
440 return None;
441 }
442 let start = self.pos;
443 while ident_continue(self.ch) {
444 self.bump();
445 }
446
447 self.with_str_from(start, |string| {
448 if string == "_" {
449 None
450 } else {
451 Some(Symbol::intern(string))
452 }
453 })
454 }
455
    /// PRECONDITION: self.ch is not whitespace
    /// Eats any kind of comment.
    ///
    /// Returns `Some` for line comments, block comments, doc-comments, and
    /// a shebang on the first line of the file; `None` when the current
    /// character does not begin any of these.
    fn scan_comment(&mut self) -> Option<TokenAndSpan> {
        if let Some(c) = self.ch {
            if c.is_whitespace() {
                // Precondition violated by the caller; report but continue.
                let msg = "called consume_any_line_comment, but there was whitespace";
                self.sess.span_diagnostic.span_err(syntax_pos::mk_sp(self.pos, self.pos), msg);
            }
        }

        if self.ch_is('/') {
            match self.nextch() {
                Some('/') => {
                    self.bump();
                    self.bump();

                    // line comments starting with "///" or "//!" are doc-comments
                    let doc_comment = self.ch_is('/') || self.ch_is('!');
                    // Back up over the "//" already consumed so the token
                    // span covers the whole comment.
                    let start_bpos = self.pos - BytePos(2);

                    // Consume up to (but not including) the line terminator.
                    while !self.is_eof() {
                        match self.ch.unwrap() {
                            '\n' => break,
                            '\r' => {
                                if self.nextch_is('\n') {
                                    // CRLF
                                    break;
                                } else if doc_comment {
                                    self.err_span_(self.pos,
                                                   self.next_pos,
                                                   "bare CR not allowed in doc-comment");
                                }
                            }
                            _ => (),
                        }
                        self.bump();
                    }

                    return if doc_comment {
                        self.with_str_from(start_bpos, |string| {
                            // comments with only more "/"s are not doc comments
                            let tok = if is_doc_comment(string) {
                                token::DocComment(Symbol::intern(string))
                            } else {
                                token::Comment
                            };

                            Some(TokenAndSpan {
                                tok: tok,
                                sp: syntax_pos::mk_sp(start_bpos, self.pos),
                            })
                        })
                    } else {
                        Some(TokenAndSpan {
                            tok: token::Comment,
                            sp: syntax_pos::mk_sp(start_bpos, self.pos),
                        })
                    };
                }
                Some('*') => {
                    self.bump();
                    self.bump();
                    self.scan_block_comment()
                }
                _ => None,
            }
        } else if self.ch_is('#') {
            if self.nextch_is('!') {

                // Parse an inner attribute.
                if self.nextnextch_is('[') {
                    return None;
                }

                // I guess this is the only way to figure out if
                // we're at the beginning of the file...
                let cmap = CodeMap::new();
                cmap.files.borrow_mut().push(self.filemap.clone());
                let loc = cmap.lookup_char_pos_adj(self.pos);
                debug!("Skipping a shebang");
                if loc.line == 1 && loc.col == CharPos(0) {
                    // FIXME: Add shebang "token", return it
                    let start = self.pos;
                    while !self.ch_is('\n') && !self.is_eof() {
                        self.bump();
                    }
                    return Some(TokenAndSpan {
                        tok: token::Shebang(self.name_from(start)),
                        sp: syntax_pos::mk_sp(start, self.pos),
                    });
                }
            }
            None
        } else {
            None
        }
    }
553
554 /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
555 /// return None.
556 fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
557 match self.ch.unwrap_or('\0') {
558 // # to handle shebang at start of file -- this is the entry point
559 // for skipping over all "junk"
560 '/' | '#' => {
561 let c = self.scan_comment();
562 debug!("scanning a comment {:?}", c);
563 c
564 },
565 c if is_pattern_whitespace(Some(c)) => {
566 let start_bpos = self.pos;
567 while is_pattern_whitespace(self.ch) {
568 self.bump();
569 }
570 let c = Some(TokenAndSpan {
571 tok: token::Whitespace,
572 sp: syntax_pos::mk_sp(start_bpos, self.pos),
573 });
574 debug!("scanning whitespace: {:?}", c);
575 c
576 }
577 _ => None,
578 }
579 }
580
    /// Might return a sugared-doc-attr
    ///
    /// Called with the "/*" already consumed. Handles nested block
    /// comments by tracking a nesting `level`; fatal on unterminated
    /// comments. CRLF inside a block doc-comment is normalized to LF.
    fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
        // block comments starting with "/**" or "/*!" are doc-comments
        let is_doc_comment = self.ch_is('*') || self.ch_is('!');
        // Back up over the already-consumed "/*" for the token span.
        let start_bpos = self.pos - BytePos(2);

        let mut level: isize = 1;
        let mut has_cr = false;
        while level > 0 {
            if self.is_eof() {
                let msg = if is_doc_comment {
                    "unterminated block doc-comment"
                } else {
                    "unterminated block comment"
                };
                let last_bpos = self.pos;
                panic!(self.fatal_span_(start_bpos, last_bpos, msg));
            }
            let n = self.ch.unwrap();
            match n {
                '/' if self.nextch_is('*') => {
                    level += 1;
                    self.bump();
                }
                '*' if self.nextch_is('/') => {
                    level -= 1;
                    self.bump();
                }
                '\r' => {
                    // Remember so we only pay for CRLF translation if needed.
                    has_cr = true;
                }
                _ => (),
            }
            self.bump();
        }

        self.with_str_from(start_bpos, |string| {
            // but comments with only "*"s between two "/"s are not
            let tok = if is_block_doc_comment(string) {
                let string = if has_cr {
                    self.translate_crlf(start_bpos,
                                        string,
                                        "bare CR not allowed in block doc-comment")
                } else {
                    string.into()
                };
                token::DocComment(Symbol::intern(&string[..]))
            } else {
                token::Comment
            };

            Some(TokenAndSpan {
                tok: tok,
                sp: syntax_pos::mk_sp(start_bpos, self.pos),
            })
        })
    }
638
    /// Scan through any digits (base `scan_radix`) or underscores,
    /// and return how many digits there were.
    ///
    /// `real_radix` represents the true radix of the number we're
    /// interested in, and errors will be emitted for any digits
    /// between `real_radix` and `scan_radix`.
    ///
    /// Underscores are skipped and do not count toward the returned length.
    fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
        assert!(real_radix <= scan_radix);
        let mut len = 0;
        loop {
            let c = self.ch;
            if c == Some('_') {
                debug!("skipping a _");
                self.bump();
                continue;
            }
            match c.and_then(|cc| cc.to_digit(scan_radix)) {
                Some(_) => {
                    debug!("{:?} in scan_digits", c);
                    // check that the hypothetical digit is actually
                    // in range for the true radix
                    if c.unwrap().to_digit(real_radix).is_none() {
                        self.err_span_(self.pos,
                                       self.next_pos,
                                       &format!("invalid digit for a base {} literal", real_radix));
                    }
                    len += 1;
                    self.bump();
                }
                // Anything that isn't a digit in `scan_radix` ends the run.
                _ => return len,
            }
        }
    }
672
    /// Lex a LIT_INTEGER or a LIT_FLOAT
    ///
    /// `c` is the first (already-peeked) character of the literal. Handles
    /// `0b`/`0o`/`0x` prefixes, decimal literals, and the float forms with
    /// a fractional part and/or exponent. Note that binary/octal literals
    /// are scanned with `scan_radix` 10 so that out-of-range decimal digits
    /// (e.g. `0b102`) are consumed and diagnosed rather than split.
    fn scan_number(&mut self, c: char) -> token::Lit {
        let num_digits;
        let mut base = 10;
        let start_bpos = self.pos;

        self.bump();

        if c == '0' {
            match self.ch.unwrap_or('\0') {
                'b' => {
                    self.bump();
                    base = 2;
                    num_digits = self.scan_digits(2, 10);
                }
                'o' => {
                    self.bump();
                    base = 8;
                    num_digits = self.scan_digits(8, 10);
                }
                'x' => {
                    self.bump();
                    base = 16;
                    num_digits = self.scan_digits(16, 16);
                }
                '0'...'9' | '_' | '.' => {
                    // The leading '0' itself counts as a digit.
                    num_digits = self.scan_digits(10, 10) + 1;
                }
                _ => {
                    // just a 0
                    return token::Integer(self.name_from(start_bpos));
                }
            }
        } else if c.is_digit(10) {
            num_digits = self.scan_digits(10, 10) + 1;
        } else {
            num_digits = 0;
        }

        if num_digits == 0 {
            // e.g. `0x` or `0b` with nothing after the prefix.
            self.err_span_(start_bpos,
                           self.pos,
                           "no valid digits found for number");
            return token::Integer(Symbol::intern("0"));
        }

        // might be a float, but don't be greedy if this is actually an
        // integer literal followed by field/method access or a range pattern
        // (`0..2` and `12.foo()`)
        if self.ch_is('.') && !self.nextch_is('.') &&
           !self.nextch()
                .unwrap_or('\0')
                .is_xid_start() {
            // might have stuff after the ., and if it does, it needs to start
            // with a number
            self.bump();
            if self.ch.unwrap_or('\0').is_digit(10) {
                self.scan_digits(10, 10);
                self.scan_float_exponent();
            }
            let pos = self.pos;
            self.check_float_base(start_bpos, pos, base);
            return token::Float(self.name_from(start_bpos));
        } else {
            // it might be a float if it has an exponent
            if self.ch_is('e') || self.ch_is('E') {
                self.scan_float_exponent();
                let pos = self.pos;
                self.check_float_base(start_bpos, pos, base);
                return token::Float(self.name_from(start_bpos));
            }
            // but we certainly have an integer!
            return token::Integer(self.name_from(start_bpos));
        }
    }
748
    /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an
    /// error if too many or too few digits are encountered.
    ///
    /// With `below_0x7f_only`, additionally rejects values >= 0x80 (used
    /// for `\xHH` escapes in char/string literals). Returns whether the
    /// escape was valid.
    fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
        debug!("scanning {} digits until {:?}", n_digits, delim);
        let start_bpos = self.pos;
        let mut accum_int = 0;

        let mut valid = true;
        for _ in 0..n_digits {
            if self.is_eof() {
                let last_bpos = self.pos;
                panic!(self.fatal_span_(start_bpos,
                                        last_bpos,
                                        "unterminated numeric character escape"));
            }
            // Hitting the closing delimiter early means too few digits.
            if self.ch_is(delim) {
                let last_bpos = self.pos;
                self.err_span_(start_bpos,
                               last_bpos,
                               "numeric character escape is too short");
                valid = false;
                break;
            }
            let c = self.ch.unwrap_or('\x00');
            accum_int *= 16;
            // Non-hex characters contribute 0 and mark the escape invalid.
            accum_int += c.to_digit(16).unwrap_or_else(|| {
                self.err_span_char(self.pos,
                                   self.next_pos,
                                   "invalid character in numeric character escape",
                                   c);

                valid = false;
                0
            });
            self.bump();
        }

        if below_0x7f_only && accum_int >= 0x80 {
            self.err_span_(start_bpos,
                           self.pos,
                           "this form of character escape may only be used with characters in \
                            the range [\\x00-\\x7f]");
            valid = false;
        }

        // Finally, the accumulated value must be a valid Unicode scalar.
        match char::from_u32(accum_int) {
            Some(_) => valid,
            None => {
                let last_bpos = self.pos;
                self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
                false
            }
        }
    }
803
    /// Scan for a single (possibly escaped) byte or char
    /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
    /// `start` is the position of `first_source_char`, which is already consumed.
    ///
    /// Returns true if there was a valid char/byte, false otherwise.
    ///
    /// `ascii_only` is true for byte/byte-string literals; `delim` is the
    /// literal's closing delimiter (`'` or `"`), which changes how escaped
    /// newlines and stray quote characters are treated.
    fn scan_char_or_byte(&mut self,
                         start: BytePos,
                         first_source_char: char,
                         ascii_only: bool,
                         delim: char)
                         -> bool {
        match first_source_char {
            '\\' => {
                // '\X' for some X must be a character constant:
                let escaped = self.ch;
                let escaped_pos = self.pos;
                self.bump();
                match escaped {
                    None => {} // EOF here is an error that will be checked later.
                    Some(e) => {
                        return match e {
                            // Simple one-character escapes.
                            'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
                            'x' => self.scan_byte_escape(delim, !ascii_only),
                            'u' => {
                                // `\u` must be followed by `{...}`.
                                let valid = if self.ch_is('{') {
                                    self.scan_unicode_escape(delim) && !ascii_only
                                } else {
                                    let span = syntax_pos::mk_sp(start, self.pos);
                                    self.sess.span_diagnostic
                                        .struct_span_err(span, "incorrect unicode escape sequence")
                                        .span_help(span,
                                                   "format of unicode escape sequences is \
                                                    `\\u{…}`")
                                        .emit();
                                    false
                                };
                                if ascii_only {
                                    self.err_span_(start,
                                                   self.pos,
                                                   "unicode escape sequences cannot be used as a \
                                                    byte or in a byte string");
                                }
                                valid

                            }
                            // In string literals, a backslash-newline swallows
                            // the following whitespace (line continuation).
                            '\n' if delim == '"' => {
                                self.consume_whitespace();
                                true
                            }
                            '\r' if delim == '"' && self.ch_is('\n') => {
                                self.consume_whitespace();
                                true
                            }
                            c => {
                                let pos = self.pos;
                                let mut err = self.struct_err_span_char(escaped_pos,
                                                                        pos,
                                                                        if ascii_only {
                                                                            "unknown byte escape"
                                                                        } else {
                                                                            "unknown character \
                                                                             escape"
                                                                        },
                                                                        c);
                                if e == '\r' {
                                    err.span_help(syntax_pos::mk_sp(escaped_pos, pos),
                                                  "this is an isolated carriage return; consider \
                                                   checking your editor and version control \
                                                   settings");
                                }
                                if (e == '{' || e == '}') && !ascii_only {
                                    err.span_help(syntax_pos::mk_sp(escaped_pos, pos),
                                                  "if used in a formatting string, curly braces \
                                                   are escaped with `{{` and `}}`");
                                }
                                err.emit();
                                false
                            }
                        }
                    }
                }
            }
            // Inside a char literal these must be written escaped.
            '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
                let pos = self.pos;
                self.err_span_char(start,
                                   pos,
                                   if ascii_only {
                                       "byte constant must be escaped"
                                   } else {
                                       "character constant must be escaped"
                                   },
                                   first_source_char);
                return false;
            }
            '\r' => {
                // A CRLF pair is accepted (and consumed); a bare CR is not.
                if self.ch_is('\n') {
                    self.bump();
                    return true;
                } else {
                    self.err_span_(start,
                                   self.pos,
                                   "bare CR not allowed in string, use \\r instead");
                    return false;
                }
            }
            _ => {
                if ascii_only && first_source_char > '\x7F' {
                    let pos = self.pos;
                    self.err_span_(start,
                                   pos,
                                   "byte constant must be ASCII. Use a \\xHH escape for a \
                                    non-ASCII byte");
                    return false;
                }
            }
        }
        true
    }
922
    /// Scan over a \u{...} escape
    ///
    /// At this point, we have already seen the \ and the u, the { is the current character. We
    /// will read at least one digit, and up to 6, and pass over the }.
    ///
    /// Returns whether the escape denotes a valid Unicode scalar value.
    fn scan_unicode_escape(&mut self, delim: char) -> bool {
        self.bump(); // past the {
        let start_bpos = self.pos;
        let mut count = 0;
        let mut accum_int = 0;
        let mut valid = true;

        // `count <= 6` lets a 7th digit through so the overlong case below
        // can be diagnosed precisely.
        while !self.ch_is('}') && count <= 6 {
            let c = match self.ch {
                Some(c) => c,
                None => {
                    panic!(self.fatal_span_(start_bpos,
                                            self.pos,
                                            "unterminated unicode escape (found EOF)"));
                }
            };
            accum_int *= 16;
            accum_int += c.to_digit(16).unwrap_or_else(|| {
                // Reaching the literal's delimiter means the `}` was missing.
                if c == delim {
                    panic!(self.fatal_span_(self.pos,
                                            self.next_pos,
                                            "unterminated unicode escape (needed a `}`)"));
                } else {
                    self.err_span_char(self.pos,
                                       self.next_pos,
                                       "invalid character in unicode escape",
                                       c);
                }
                valid = false;
                0
            });
            self.bump();
            count += 1;
        }

        if count > 6 {
            self.err_span_(start_bpos,
                           self.pos,
                           "overlong unicode escape (can have at most 6 hex digits)");
            valid = false;
        }

        // Empty escapes (`\u{}`) and non-scalar values are rejected.
        if valid && (char::from_u32(accum_int).is_none() || count == 0) {
            self.err_span_(start_bpos,
                           self.pos,
                           "invalid unicode character escape");
            valid = false;
        }

        self.bump(); // past the ending }
        valid
    }
979
980 /// Scan over a float exponent.
981 fn scan_float_exponent(&mut self) {
982 if self.ch_is('e') || self.ch_is('E') {
983 self.bump();
984 if self.ch_is('-') || self.ch_is('+') {
985 self.bump();
986 }
987 if self.scan_digits(10, 10) == 0 {
988 self.err_span_(self.pos,
989 self.next_pos,
990 "expected at least one digit in exponent")
991 }
992 }
993 }
994
995 /// Check that a base is valid for a floating literal, emitting a nice
996 /// error if it isn't.
997 fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) {
998 match base {
999 16 => {
1000 self.err_span_(start_bpos,
1001 last_bpos,
1002 "hexadecimal float literal is not supported")
1003 }
1004 8 => {
1005 self.err_span_(start_bpos,
1006 last_bpos,
1007 "octal float literal is not supported")
1008 }
1009 2 => {
1010 self.err_span_(start_bpos,
1011 last_bpos,
1012 "binary float literal is not supported")
1013 }
1014 _ => (),
1015 }
1016 }
1017
1018 fn binop(&mut self, op: token::BinOpToken) -> token::Token {
1019 self.bump();
1020 if self.ch_is('=') {
1021 self.bump();
1022 return token::BinOpEq(op);
1023 } else {
1024 return token::BinOp(op);
1025 }
1026 }
1027
1028 /// Return the next token from the string, advances the input past that
1029 /// token, and updates the interner
1030 fn next_token_inner(&mut self) -> Result<token::Token, ()> {
1031 let c = self.ch;
1032 if ident_start(c) &&
1033 match (c.unwrap(), self.nextch(), self.nextnextch()) {
1034 // Note: r as in r" or r#" is part of a raw string literal,
1035 // b as in b' is part of a byte literal.
1036 // They are not identifiers, and are handled further down.
1037 ('r', Some('"'), _) |
1038 ('r', Some('#'), _) |
1039 ('b', Some('"'), _) |
1040 ('b', Some('\''), _) |
1041 ('b', Some('r'), Some('"')) |
1042 ('b', Some('r'), Some('#')) => false,
1043 _ => true,
1044 } {
1045 let start = self.pos;
1046 while ident_continue(self.ch) {
1047 self.bump();
1048 }
1049
1050 return Ok(self.with_str_from(start, |string| {
1051 if string == "_" {
1052 token::Underscore
1053 } else {
1054 // FIXME: perform NFKC normalization here. (Issue #2253)
1055 token::Ident(Ident::from_str(string))
1056 }
1057 }));
1058 }
1059
1060 if is_dec_digit(c) {
1061 let num = self.scan_number(c.unwrap());
1062 let suffix = self.scan_optional_raw_name();
1063 debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix);
1064 return Ok(token::Literal(num, suffix));
1065 }
1066
1067 match c.expect("next_token_inner called at EOF") {
1068 // One-byte tokens.
1069 ';' => {
1070 self.bump();
1071 return Ok(token::Semi);
1072 }
1073 ',' => {
1074 self.bump();
1075 return Ok(token::Comma);
1076 }
1077 '.' => {
1078 self.bump();
1079 return if self.ch_is('.') {
1080 self.bump();
1081 if self.ch_is('.') {
1082 self.bump();
1083 Ok(token::DotDotDot)
1084 } else {
1085 Ok(token::DotDot)
1086 }
1087 } else {
1088 Ok(token::Dot)
1089 };
1090 }
1091 '(' => {
1092 self.bump();
1093 return Ok(token::OpenDelim(token::Paren));
1094 }
1095 ')' => {
1096 self.bump();
1097 return Ok(token::CloseDelim(token::Paren));
1098 }
1099 '{' => {
1100 self.bump();
1101 return Ok(token::OpenDelim(token::Brace));
1102 }
1103 '}' => {
1104 self.bump();
1105 return Ok(token::CloseDelim(token::Brace));
1106 }
1107 '[' => {
1108 self.bump();
1109 return Ok(token::OpenDelim(token::Bracket));
1110 }
1111 ']' => {
1112 self.bump();
1113 return Ok(token::CloseDelim(token::Bracket));
1114 }
1115 '@' => {
1116 self.bump();
1117 return Ok(token::At);
1118 }
1119 '#' => {
1120 self.bump();
1121 return Ok(token::Pound);
1122 }
1123 '~' => {
1124 self.bump();
1125 return Ok(token::Tilde);
1126 }
1127 '?' => {
1128 self.bump();
1129 return Ok(token::Question);
1130 }
1131 ':' => {
1132 self.bump();
1133 if self.ch_is(':') {
1134 self.bump();
1135 return Ok(token::ModSep);
1136 } else {
1137 return Ok(token::Colon);
1138 }
1139 }
1140
1141 '$' => {
1142 self.bump();
1143 return Ok(token::Dollar);
1144 }
1145
1146 // Multi-byte tokens.
1147 '=' => {
1148 self.bump();
1149 if self.ch_is('=') {
1150 self.bump();
1151 return Ok(token::EqEq);
1152 } else if self.ch_is('>') {
1153 self.bump();
1154 return Ok(token::FatArrow);
1155 } else {
1156 return Ok(token::Eq);
1157 }
1158 }
1159 '!' => {
1160 self.bump();
1161 if self.ch_is('=') {
1162 self.bump();
1163 return Ok(token::Ne);
1164 } else {
1165 return Ok(token::Not);
1166 }
1167 }
1168 '<' => {
1169 self.bump();
1170 match self.ch.unwrap_or('\x00') {
1171 '=' => {
1172 self.bump();
1173 return Ok(token::Le);
1174 }
1175 '<' => {
1176 return Ok(self.binop(token::Shl));
1177 }
1178 '-' => {
1179 self.bump();
1180 match self.ch.unwrap_or('\x00') {
1181 _ => {
1182 return Ok(token::LArrow);
1183 }
1184 }
1185 }
1186 _ => {
1187 return Ok(token::Lt);
1188 }
1189 }
1190 }
1191 '>' => {
1192 self.bump();
1193 match self.ch.unwrap_or('\x00') {
1194 '=' => {
1195 self.bump();
1196 return Ok(token::Ge);
1197 }
1198 '>' => {
1199 return Ok(self.binop(token::Shr));
1200 }
1201 _ => {
1202 return Ok(token::Gt);
1203 }
1204 }
1205 }
1206 '\'' => {
1207 // Either a character constant 'a' OR a lifetime name 'abc
1208 let start_with_quote = self.pos;
1209 self.bump();
1210 let start = self.pos;
1211
1212 // the eof will be picked up by the final `'` check below
1213 let c2 = self.ch.unwrap_or('\x00');
1214 self.bump();
1215
1216 // If the character is an ident start not followed by another single
1217 // quote, then this is a lifetime name:
1218 if ident_start(Some(c2)) && !self.ch_is('\'') {
1219 while ident_continue(self.ch) {
1220 self.bump();
1221 }
1222 // lifetimes shouldn't end with a single quote
1223 // if we find one, then this is an invalid character literal
1224 if self.ch_is('\'') {
1225 panic!(self.fatal_span_verbose(
1226 start_with_quote, self.next_pos,
1227 String::from("character literal may only contain one codepoint")));
1228
1229 }
1230
1231 // Include the leading `'` in the real identifier, for macro
1232 // expansion purposes. See #12512 for the gory details of why
1233 // this is necessary.
1234 let ident = self.with_str_from(start, |lifetime_name| {
1235 Ident::from_str(&format!("'{}", lifetime_name))
1236 });
1237
1238 // Conjure up a "keyword checking ident" to make sure that
1239 // the lifetime name is not a keyword.
1240 let keyword_checking_ident = self.with_str_from(start, |lifetime_name| {
1241 Ident::from_str(lifetime_name)
1242 });
1243 let keyword_checking_token = &token::Ident(keyword_checking_ident);
1244 let last_bpos = self.pos;
1245 if keyword_checking_token.is_any_keyword() &&
1246 !keyword_checking_token.is_keyword(keywords::Static) {
1247 self.err_span_(start, last_bpos, "lifetimes cannot use keyword names");
1248 }
1249
1250 return Ok(token::Lifetime(ident));
1251 }
1252
1253 let valid = self.scan_char_or_byte(start,
1254 c2,
1255 // ascii_only =
1256 false,
1257 '\'');
1258
1259 if !self.ch_is('\'') {
1260 panic!(self.fatal_span_verbose(
1261 start_with_quote, self.pos,
1262 String::from("character literal may only contain one codepoint")));
1263 }
1264
1265 let id = if valid {
1266 self.name_from(start)
1267 } else {
1268 Symbol::intern("0")
1269 };
1270 self.bump(); // advance ch past token
1271 let suffix = self.scan_optional_raw_name();
1272 return Ok(token::Literal(token::Char(id), suffix));
1273 }
1274 'b' => {
1275 self.bump();
1276 let lit = match self.ch {
1277 Some('\'') => self.scan_byte(),
1278 Some('"') => self.scan_byte_string(),
1279 Some('r') => self.scan_raw_byte_string(),
1280 _ => unreachable!(), // Should have been a token::Ident above.
1281 };
1282 let suffix = self.scan_optional_raw_name();
1283 return Ok(token::Literal(lit, suffix));
1284 }
1285 '"' => {
1286 let start_bpos = self.pos;
1287 let mut valid = true;
1288 self.bump();
1289 while !self.ch_is('"') {
1290 if self.is_eof() {
1291 let last_bpos = self.pos;
1292 panic!(self.fatal_span_(start_bpos,
1293 last_bpos,
1294 "unterminated double quote string"));
1295 }
1296
1297 let ch_start = self.pos;
1298 let ch = self.ch.unwrap();
1299 self.bump();
1300 valid &= self.scan_char_or_byte(ch_start,
1301 ch,
1302 // ascii_only =
1303 false,
1304 '"');
1305 }
1306 // adjust for the ASCII " at the start of the literal
1307 let id = if valid {
1308 self.name_from(start_bpos + BytePos(1))
1309 } else {
1310 Symbol::intern("??")
1311 };
1312 self.bump();
1313 let suffix = self.scan_optional_raw_name();
1314 return Ok(token::Literal(token::Str_(id), suffix));
1315 }
1316 'r' => {
1317 let start_bpos = self.pos;
1318 self.bump();
1319 let mut hash_count = 0;
1320 while self.ch_is('#') {
1321 self.bump();
1322 hash_count += 1;
1323 }
1324
1325 if self.is_eof() {
1326 let last_bpos = self.pos;
1327 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1328 } else if !self.ch_is('"') {
1329 let last_bpos = self.pos;
1330 let curr_char = self.ch.unwrap();
1331 panic!(self.fatal_span_char(start_bpos,
1332 last_bpos,
1333 "found invalid character; only `#` is allowed \
1334 in raw string delimitation",
1335 curr_char));
1336 }
1337 self.bump();
1338 let content_start_bpos = self.pos;
1339 let mut content_end_bpos;
1340 let mut valid = true;
1341 'outer: loop {
1342 if self.is_eof() {
1343 let last_bpos = self.pos;
1344 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1345 }
1346 // if self.ch_is('"') {
1347 // content_end_bpos = self.pos;
1348 // for _ in 0..hash_count {
1349 // self.bump();
1350 // if !self.ch_is('#') {
1351 // continue 'outer;
1352 let c = self.ch.unwrap();
1353 match c {
1354 '"' => {
1355 content_end_bpos = self.pos;
1356 for _ in 0..hash_count {
1357 self.bump();
1358 if !self.ch_is('#') {
1359 continue 'outer;
1360 }
1361 }
1362 break;
1363 }
1364 '\r' => {
1365 if !self.nextch_is('\n') {
1366 let last_bpos = self.pos;
1367 self.err_span_(start_bpos,
1368 last_bpos,
1369 "bare CR not allowed in raw string, use \\r \
1370 instead");
1371 valid = false;
1372 }
1373 }
1374 _ => (),
1375 }
1376 self.bump();
1377 }
1378 self.bump();
1379 let id = if valid {
1380 self.name_from_to(content_start_bpos, content_end_bpos)
1381 } else {
1382 Symbol::intern("??")
1383 };
1384 let suffix = self.scan_optional_raw_name();
1385 return Ok(token::Literal(token::StrRaw(id, hash_count), suffix));
1386 }
1387 '-' => {
1388 if self.nextch_is('>') {
1389 self.bump();
1390 self.bump();
1391 return Ok(token::RArrow);
1392 } else {
1393 return Ok(self.binop(token::Minus));
1394 }
1395 }
1396 '&' => {
1397 if self.nextch_is('&') {
1398 self.bump();
1399 self.bump();
1400 return Ok(token::AndAnd);
1401 } else {
1402 return Ok(self.binop(token::And));
1403 }
1404 }
1405 '|' => {
1406 match self.nextch() {
1407 Some('|') => {
1408 self.bump();
1409 self.bump();
1410 return Ok(token::OrOr);
1411 }
1412 _ => {
1413 return Ok(self.binop(token::Or));
1414 }
1415 }
1416 }
1417 '+' => {
1418 return Ok(self.binop(token::Plus));
1419 }
1420 '*' => {
1421 return Ok(self.binop(token::Star));
1422 }
1423 '/' => {
1424 return Ok(self.binop(token::Slash));
1425 }
1426 '^' => {
1427 return Ok(self.binop(token::Caret));
1428 }
1429 '%' => {
1430 return Ok(self.binop(token::Percent));
1431 }
1432 c => {
1433 let last_bpos = self.pos;
1434 let bpos = self.next_pos;
1435 let mut err = self.struct_fatal_span_char(last_bpos,
1436 bpos,
1437 "unknown start of token",
1438 c);
1439 unicode_chars::check_for_substitution(&self, c, &mut err);
1440 self.fatal_errs.push(err);
1441 Err(())
1442 }
1443 }
1444 }
1445
1446 fn consume_whitespace(&mut self) {
1447 while is_pattern_whitespace(self.ch) && !self.is_eof() {
1448 self.bump();
1449 }
1450 }
1451
1452 fn read_to_eol(&mut self) -> String {
1453 let mut val = String::new();
1454 while !self.ch_is('\n') && !self.is_eof() {
1455 val.push(self.ch.unwrap());
1456 self.bump();
1457 }
1458 if self.ch_is('\n') {
1459 self.bump();
1460 }
1461 return val;
1462 }
1463
1464 fn read_one_line_comment(&mut self) -> String {
1465 let val = self.read_to_eol();
1466 assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') ||
1467 (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!'));
1468 return val;
1469 }
1470
1471 fn consume_non_eol_whitespace(&mut self) {
1472 while is_pattern_whitespace(self.ch) && !self.ch_is('\n') && !self.is_eof() {
1473 self.bump();
1474 }
1475 }
1476
1477 fn peeking_at_comment(&self) -> bool {
1478 (self.ch_is('/') && self.nextch_is('/')) || (self.ch_is('/') && self.nextch_is('*')) ||
1479 // consider shebangs comments, but not inner attributes
1480 (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
1481 }
1482
1483 fn scan_byte(&mut self) -> token::Lit {
1484 self.bump();
1485 let start = self.pos;
1486
1487 // the eof will be picked up by the final `'` check below
1488 let c2 = self.ch.unwrap_or('\x00');
1489 self.bump();
1490
1491 let valid = self.scan_char_or_byte(start,
1492 c2,
1493 // ascii_only =
1494 true,
1495 '\'');
1496 if !self.ch_is('\'') {
1497 // Byte offsetting here is okay because the
1498 // character before position `start` are an
1499 // ascii single quote and ascii 'b'.
1500 let pos = self.pos;
1501 panic!(self.fatal_span_verbose(start - BytePos(2),
1502 pos,
1503 "unterminated byte constant".to_string()));
1504 }
1505
1506 let id = if valid {
1507 self.name_from(start)
1508 } else {
1509 Symbol::intern("?")
1510 };
1511 self.bump(); // advance ch past token
1512 return token::Byte(id);
1513 }
1514
/// Scans the two hex digits of a byte escape (`\xNN`) by delegating to
/// `scan_hex_digits` with a fixed digit count of 2. `delim` is the
/// enclosing literal's closing delimiter; `below_0x7f_only` presumably
/// restricts the value to ASCII range — enforced inside `scan_hex_digits`,
/// which is not visible here (TODO confirm). Returns that call's validity
/// result.
fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
    self.scan_hex_digits(2, delim, below_0x7f_only)
}
1518
1519 fn scan_byte_string(&mut self) -> token::Lit {
1520 self.bump();
1521 let start = self.pos;
1522 let mut valid = true;
1523
1524 while !self.ch_is('"') {
1525 if self.is_eof() {
1526 let pos = self.pos;
1527 panic!(self.fatal_span_(start, pos, "unterminated double quote byte string"));
1528 }
1529
1530 let ch_start = self.pos;
1531 let ch = self.ch.unwrap();
1532 self.bump();
1533 valid &= self.scan_char_or_byte(ch_start,
1534 ch,
1535 // ascii_only =
1536 true,
1537 '"');
1538 }
1539 let id = if valid {
1540 self.name_from(start)
1541 } else {
1542 Symbol::intern("??")
1543 };
1544 self.bump();
1545 return token::ByteStr(id);
1546 }
1547
/// Scans a raw byte-string literal `br##"..."##`; the caller has already
/// consumed the `b` and the current char is the `r`.
fn scan_raw_byte_string(&mut self) -> token::Lit {
    let start_bpos = self.pos;
    self.bump();
    // Count the `#`s of the opening delimiter; the closing delimiter must
    // repeat exactly as many.
    let mut hash_count = 0;
    while self.ch_is('#') {
        self.bump();
        hash_count += 1;
    }

    if self.is_eof() {
        let pos = self.pos;
        panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"));
    } else if !self.ch_is('"') {
        let pos = self.pos;
        let ch = self.ch.unwrap();
        panic!(self.fatal_span_char(start_bpos,
                                    pos,
                                    "found invalid character; only `#` is allowed in raw \
                                     string delimitation",
                                    ch));
    }
    self.bump();
    let content_start_bpos = self.pos;
    let mut content_end_bpos;
    'outer: loop {
        match self.ch {
            None => {
                let pos = self.pos;
                panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"))
            }
            Some('"') => {
                // Tentatively treat this quote as the end of the content;
                // it only terminates the literal if followed by exactly
                // `hash_count` `#`s.
                content_end_bpos = self.pos;
                for _ in 0..hash_count {
                    self.bump();
                    if !self.ch_is('#') {
                        // Not a real terminator — resume scanning content.
                        continue 'outer;
                    }
                }
                break;
            }
            Some(c) => {
                // Raw *byte* strings must stay within ASCII; emit a
                // non-fatal error and keep scanning.
                if c > '\x7F' {
                    let pos = self.pos;
                    self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
                }
            }
        }
        self.bump();
    }
    self.bump();
    return token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos),
                             hash_count);
}
1601 }
1602
1603 // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1604 // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1605 pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1606 c.map_or(false, Pattern_White_Space)
1607 }
1608
/// True if `c` holds a character in the inclusive range `lo..=hi`.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |c| lo <= c && c <= hi)
}

/// True if `c` holds an ASCII decimal digit (`0`-`9`).
fn is_dec_digit(c: Option<char>) -> bool {
    in_range(c, '0', '9')
}
1619
1620 pub fn is_doc_comment(s: &str) -> bool {
1621 let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
1622 s.starts_with("//!");
1623 debug!("is {:?} a doc comment? {}", s, res);
1624 res
1625 }
1626
1627 pub fn is_block_doc_comment(s: &str) -> bool {
1628 // Prevent `/**/` from being parsed as a doc comment
1629 let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
1630 s.starts_with("/*!")) && s.len() >= 5;
1631 debug!("is {:?} a doc comment? {}", s, res);
1632 res
1633 }
1634
1635 fn ident_start(c: Option<char>) -> bool {
1636 let c = match c {
1637 Some(c) => c,
1638 None => return false,
1639 };
1640
1641 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start())
1642 }
1643
1644 fn ident_continue(c: Option<char>) -> bool {
1645 let c = match c {
1646 Some(c) => c,
1647 None => return false,
1648 };
1649
1650 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' ||
1651 (c > '\x7f' && c.is_xid_continue())
1652 }
1653
#[cfg(test)]
mod tests {
    use super::*;

    use ast::{Ident, CrateConfig};
    use symbol::Symbol;
    use syntax_pos::{BytePos, Span, NO_EXPANSION};
    use codemap::CodeMap;
    use errors;
    use feature_gate::UnstableFeatures;
    use parse::token;
    use std::cell::RefCell;
    use std::io;
    use std::rc::Rc;

    // Build a minimal ParseSess for lexing; diagnostics go to io::sink()
    // so expected errors do not pollute test output.
    fn mk_sess(cm: Rc<CodeMap>) -> ParseSess {
        let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), Some(cm.clone()));
        ParseSess {
            span_diagnostic: errors::Handler::with_emitter(true, false, Box::new(emitter)),
            unstable_features: UnstableFeatures::from_environment(),
            config: CrateConfig::new(),
            included_mod_stack: RefCell::new(Vec::new()),
            code_map: cm,
        }
    }

    // open a string reader for the given string
    fn setup<'a>(cm: &CodeMap,
                 sess: &'a ParseSess,
                 teststr: String)
                 -> StringReader<'a> {
        let fm = cm.new_filemap("zebra.rs".to_string(), None, teststr);
        StringReader::new(sess, fm)
    }

    // Lex a small source file and check both the token stream and the
    // exact byte spans/positions reported for individual tokens.
    #[test]
    fn t1() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        let mut string_reader = setup(&cm,
                                      &sh,
                                      "/* my source file */ fn main() { println!(\"zebra\"); }\n"
                                          .to_string());
        let id = Ident::from_str("fn");
        assert_eq!(string_reader.next_token().tok, token::Comment);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);
        let tok1 = string_reader.next_token();
        let tok2 = TokenAndSpan {
            tok: token::Ident(id),
            sp: Span {
                lo: BytePos(21),
                hi: BytePos(23),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(tok1, tok2);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);
        // the 'main' id is already read:
        assert_eq!(string_reader.pos.clone(), BytePos(28));
        // read another token:
        let tok3 = string_reader.next_token();
        let tok4 = TokenAndSpan {
            tok: token::Ident(Ident::from_str("main")),
            sp: Span {
                lo: BytePos(24),
                hi: BytePos(28),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(tok3, tok4);
        // the lparen is already read:
        assert_eq!(string_reader.pos.clone(), BytePos(29))
    }

    // check that the given reader produces the desired stream
    // of tokens (stop checking after exhausting the expected vec)
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for expected_tok in &expected {
            assert_eq!(&string_reader.next_token().tok, expected_tok);
        }
    }

    // make the identifier by looking up the string in the interner
    fn mk_ident(id: &str) -> token::Token {
        token::Ident(Ident::from_str(id))
    }

    #[test]
    fn doublecolonparsing() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a b".to_string()),
                           vec![mk_ident("a"), token::Whitespace, mk_ident("b")]);
    }

    #[test]
    fn dcparsing_2() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a::b".to_string()),
                           vec![mk_ident("a"), token::ModSep, mk_ident("b")]);
    }

    #[test]
    fn dcparsing_3() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a ::b".to_string()),
                           vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")]);
    }

    #[test]
    fn dcparsing_4() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a:: b".to_string()),
                           vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")]);
    }

    #[test]
    fn character_a() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "'a'".to_string()).next_token().tok,
                   token::Literal(token::Char(Symbol::intern("a")), None));
    }

    #[test]
    fn character_space() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "' '".to_string()).next_token().tok,
                   token::Literal(token::Char(Symbol::intern(" ")), None));
    }

    // The literal's name keeps the escape sequence un-decoded ("\\n").
    #[test]
    fn character_escaped() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "'\\n'".to_string()).next_token().tok,
                   token::Literal(token::Char(Symbol::intern("\\n")), None));
    }

    // A lifetime ident includes its leading quote (see the lexer's note
    // about macro expansion).
    #[test]
    fn lifetime_name() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "'abc".to_string()).next_token().tok,
                   token::Lifetime(Ident::from_str("'abc")));
    }

    // Raw string content may contain quotes, backslashes, and NULs; the
    // token records the hash count of the delimiter.
    #[test]
    fn raw_string() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "r###\"\"#a\\b\x00c\"\"###".to_string())
                       .next_token()
                       .tok,
                   token::Literal(token::StrRaw(Symbol::intern("\"#a\\b\x00c\""), 3), None));
    }

    // Each literal kind attaches an adjacent suffix, and must NOT attach
    // one separated by whitespace.
    #[test]
    fn literal_suffixes() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        macro_rules! test {
            ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
                assert_eq!(setup(&cm, &sh, format!("{}suffix", $input)).next_token().tok,
                           token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
                                          Some(Symbol::intern("suffix"))));
                // with a whitespace separator:
                assert_eq!(setup(&cm, &sh, format!("{} suffix", $input)).next_token().tok,
                           token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
                                          None));
            }}
        }

        test!("'a'", Char, "a");
        test!("b'a'", Byte, "a");
        test!("\"a\"", Str_, "a");
        test!("b\"a\"", ByteStr, "a");
        test!("1234", Integer, "1234");
        test!("0b101", Integer, "0b101");
        test!("0xABC", Integer, "0xABC");
        test!("1.0", Float, "1.0");
        test!("1.0e10", Float, "1.0e10");

        assert_eq!(setup(&cm, &sh, "2us".to_string()).next_token().tok,
                   token::Literal(token::Integer(Symbol::intern("2")),
                                  Some(Symbol::intern("us"))));
        assert_eq!(setup(&cm, &sh, "r###\"raw\"###suffix".to_string()).next_token().tok,
                   token::Literal(token::StrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));
        assert_eq!(setup(&cm, &sh, "br###\"raw\"###suffix".to_string()).next_token().tok,
                   token::Literal(token::ByteStrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));
    }

    #[test]
    fn line_doc_comments() {
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        assert!(!is_doc_comment("////"));
    }

    // Nested `/* /* */ */` must lex as a single comment token.
    #[test]
    fn nested_block_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
        match lexer.next_token().tok {
            token::Comment => {}
            _ => panic!("expected a comment!"),
        }
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(Symbol::intern("a")), None));
    }

    // CRLF line endings: the comment's span and the doc comment's text
    // must not include the `\r`.
    #[test]
    fn crlf_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());
        let comment = lexer.next_token();
        assert_eq!(comment.tok, token::Comment);
        // span covers `// test` (7 bytes), excluding the \r\n
        assert_eq!(comment.sp, ::syntax_pos::mk_sp(BytePos(0), BytePos(7)));
        assert_eq!(lexer.next_token().tok, token::Whitespace);
        assert_eq!(lexer.next_token().tok,
                   token::DocComment(Symbol::intern("/// test")));
    }
}