]> git.proxmox.com Git - rustc.git/blob - src/libsyntax/parse/lexer/mod.rs
Imported Upstream version 1.9.0+dfsg1
[rustc.git] / src / libsyntax / parse / lexer / mod.rs
1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use ast;
12 use codemap::{BytePos, CharPos, CodeMap, Pos, Span};
13 use codemap;
14 use errors::{FatalError, Handler, DiagnosticBuilder};
15 use ext::tt::transcribe::tt_next_token;
16 use parse::token::str_to_ident;
17 use parse::token;
18 use str::char_at;
19 use rustc_unicode::property::Pattern_White_Space;
20
21 use std::borrow::Cow;
22 use std::char;
23 use std::mem::replace;
24 use std::rc::Rc;
25
26 pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag};
27
28 pub mod comments;
29 mod unicode_chars;
30
31 pub trait Reader {
32 fn is_eof(&self) -> bool;
33 fn next_token(&mut self) -> TokenAndSpan;
34 /// Report a fatal error with the current span.
35 fn fatal(&self, &str) -> FatalError;
36 /// Report a non-fatal error with the current span.
37 fn err(&self, &str);
38 fn peek(&self) -> TokenAndSpan;
39 /// Get a token the parser cares about.
40 fn real_token(&mut self) -> TokenAndSpan {
41 let mut t = self.next_token();
42 loop {
43 match t.tok {
44 token::Whitespace | token::Comment | token::Shebang(_) => {
45 t = self.next_token();
46 }
47 _ => break,
48 }
49 }
50 t
51 }
52 }
53
/// A single lexed token together with the source span it came from.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct TokenAndSpan {
    pub tok: token::Token,
    pub sp: Span,
}
59
/// A lexer over the source text of a single `FileMap`.
pub struct StringReader<'a> {
    pub span_diagnostic: &'a Handler,
    /// The absolute offset within the codemap of the next character to read
    pub pos: BytePos,
    /// The absolute offset within the codemap of the last character read(curr)
    pub last_pos: BytePos,
    /// The column of the next character to read
    pub col: CharPos,
    /// The last character to be read; `None` once the reader is at EOF
    /// (see `is_eof`, which checks exactly this).
    pub curr: Option<char>,
    pub filemap: Rc<codemap::FileMap>,
    // cached:
    pub peek_tok: token::Token,
    pub peek_span: Span,

    // cache a direct reference to the source text, so that we don't have to
    // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time.
    source_text: Rc<String>,
}
79
80 impl<'a> Reader for StringReader<'a> {
81 fn is_eof(&self) -> bool {
82 self.curr.is_none()
83 }
84 /// Return the next token. EFFECT: advances the string_reader.
85 fn next_token(&mut self) -> TokenAndSpan {
86 let ret_val = TokenAndSpan {
87 tok: replace(&mut self.peek_tok, token::Underscore),
88 sp: self.peek_span,
89 };
90 self.advance_token();
91 ret_val
92 }
93 fn fatal(&self, m: &str) -> FatalError {
94 self.fatal_span(self.peek_span, m)
95 }
96 fn err(&self, m: &str) {
97 self.err_span(self.peek_span, m)
98 }
99 fn peek(&self) -> TokenAndSpan {
100 // FIXME(pcwalton): Bad copy!
101 TokenAndSpan {
102 tok: self.peek_tok.clone(),
103 sp: self.peek_span,
104 }
105 }
106 }
107
108 impl<'a> Reader for TtReader<'a> {
109 fn is_eof(&self) -> bool {
110 self.cur_tok == token::Eof
111 }
112 fn next_token(&mut self) -> TokenAndSpan {
113 let r = tt_next_token(self);
114 debug!("TtReader: r={:?}", r);
115 r
116 }
117 fn fatal(&self, m: &str) -> FatalError {
118 self.sp_diag.span_fatal(self.cur_span, m)
119 }
120 fn err(&self, m: &str) {
121 self.sp_diag.span_err(self.cur_span, m);
122 }
123 fn peek(&self) -> TokenAndSpan {
124 TokenAndSpan {
125 tok: self.cur_tok.clone(),
126 sp: self.cur_span,
127 }
128 }
129 }
130
131 impl<'a> StringReader<'a> {
132 /// For comments.rs, which hackily pokes into pos and curr
133 pub fn new_raw<'b>(span_diagnostic: &'b Handler,
134 filemap: Rc<codemap::FileMap>)
135 -> StringReader<'b> {
136 if filemap.src.is_none() {
137 span_diagnostic.bug(&format!("Cannot lex filemap \
138 without source: {}",
139 filemap.name)[..]);
140 }
141
142 let source_text = (*filemap.src.as_ref().unwrap()).clone();
143
144 let mut sr = StringReader {
145 span_diagnostic: span_diagnostic,
146 pos: filemap.start_pos,
147 last_pos: filemap.start_pos,
148 col: CharPos(0),
149 curr: Some('\n'),
150 filemap: filemap,
151 // dummy values; not read
152 peek_tok: token::Eof,
153 peek_span: codemap::DUMMY_SP,
154 source_text: source_text,
155 };
156 sr.bump();
157 sr
158 }
159
160 pub fn new<'b>(span_diagnostic: &'b Handler,
161 filemap: Rc<codemap::FileMap>)
162 -> StringReader<'b> {
163 let mut sr = StringReader::new_raw(span_diagnostic, filemap);
164 sr.advance_token();
165 sr
166 }
167
168 pub fn curr_is(&self, c: char) -> bool {
169 self.curr == Some(c)
170 }
171
172 /// Report a fatal lexical error with a given span.
173 pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
174 self.span_diagnostic.span_fatal(sp, m)
175 }
176
177 /// Report a lexical error with a given span.
178 pub fn err_span(&self, sp: Span, m: &str) {
179 self.span_diagnostic.span_err(sp, m)
180 }
181
182
183 /// Report a fatal error spanning [`from_pos`, `to_pos`).
184 fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
185 self.fatal_span(codemap::mk_sp(from_pos, to_pos), m)
186 }
187
188 /// Report a lexical error spanning [`from_pos`, `to_pos`).
189 fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
190 self.err_span(codemap::mk_sp(from_pos, to_pos), m)
191 }
192
193 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
194 /// escaped character to the error message
195 fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
196 let mut m = m.to_string();
197 m.push_str(": ");
198 for c in c.escape_default() {
199 m.push(c)
200 }
201 self.fatal_span_(from_pos, to_pos, &m[..])
202 }
203 fn struct_fatal_span_char(&self,
204 from_pos: BytePos,
205 to_pos: BytePos,
206 m: &str,
207 c: char)
208 -> DiagnosticBuilder<'a> {
209 let mut m = m.to_string();
210 m.push_str(": ");
211 for c in c.escape_default() {
212 m.push(c)
213 }
214 self.span_diagnostic.struct_span_fatal(codemap::mk_sp(from_pos, to_pos), &m[..])
215 }
216
217 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
218 /// escaped character to the error message
219 fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
220 let mut m = m.to_string();
221 m.push_str(": ");
222 for c in c.escape_default() {
223 m.push(c)
224 }
225 self.err_span_(from_pos, to_pos, &m[..]);
226 }
227 fn struct_err_span_char(&self,
228 from_pos: BytePos,
229 to_pos: BytePos,
230 m: &str,
231 c: char)
232 -> DiagnosticBuilder<'a> {
233 let mut m = m.to_string();
234 m.push_str(": ");
235 for c in c.escape_default() {
236 m.push(c)
237 }
238 self.span_diagnostic.struct_span_err(codemap::mk_sp(from_pos, to_pos), &m[..])
239 }
240
241 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
242 /// offending string to the error message
243 fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
244 m.push_str(": ");
245 let from = self.byte_offset(from_pos).to_usize();
246 let to = self.byte_offset(to_pos).to_usize();
247 m.push_str(&self.source_text[from..to]);
248 self.fatal_span_(from_pos, to_pos, &m[..])
249 }
250
251 /// Advance peek_tok and peek_span to refer to the next token, and
252 /// possibly update the interner.
253 fn advance_token(&mut self) {
254 match self.scan_whitespace_or_comment() {
255 Some(comment) => {
256 self.peek_span = comment.sp;
257 self.peek_tok = comment.tok;
258 }
259 None => {
260 if self.is_eof() {
261 self.peek_tok = token::Eof;
262 self.peek_span = codemap::mk_sp(self.filemap.end_pos, self.filemap.end_pos);
263 } else {
264 let start_bytepos = self.last_pos;
265 self.peek_tok = self.next_token_inner();
266 self.peek_span = codemap::mk_sp(start_bytepos, self.last_pos);
267 };
268 }
269 }
270 }
271
272 fn byte_offset(&self, pos: BytePos) -> BytePos {
273 (pos - self.filemap.start_pos)
274 }
275
276 /// Calls `f` with a string slice of the source text spanning from `start`
277 /// up to but excluding `self.last_pos`, meaning the slice does not include
278 /// the character `self.curr`.
279 pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T
280 where F: FnOnce(&str) -> T
281 {
282 self.with_str_from_to(start, self.last_pos, f)
283 }
284
285 /// Create a Name from a given offset to the current offset, each
286 /// adjusted 1 towards each other (assumes that on either side there is a
287 /// single-byte delimiter).
288 pub fn name_from(&self, start: BytePos) -> ast::Name {
289 debug!("taking an ident from {:?} to {:?}", start, self.last_pos);
290 self.with_str_from(start, token::intern)
291 }
292
293 /// As name_from, with an explicit endpoint.
294 pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name {
295 debug!("taking an ident from {:?} to {:?}", start, end);
296 self.with_str_from_to(start, end, token::intern)
297 }
298
299 /// Calls `f` with a string slice of the source text spanning from `start`
300 /// up to but excluding `end`.
301 fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T
302 where F: FnOnce(&str) -> T
303 {
304 f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()])
305 }
306
    /// Converts CRLF to LF in the given string, raising an error on bare CR.
    ///
    /// Fast path: if the input contains no CRLF pair, it is returned as a
    /// borrowed `Cow` without allocating; any bare `\r` still gets reported
    /// via `err_span_` using `errmsg`.
    fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
        let mut i = 0;
        while i < s.len() {
            let ch = char_at(s, i);
            let next = i + ch.len_utf8();
            if ch == '\r' {
                if next < s.len() && char_at(s, next) == '\n' {
                    // Found a CRLF pair: switch to the allocating slow path
                    // that rewrites the remainder of the string.
                    return translate_crlf_(self, start, s, errmsg, i).into();
                }
                let pos = start + BytePos(i as u32);
                let end_pos = start + BytePos(next as u32);
                self.err_span_(pos, end_pos, errmsg);
            }
            i = next;
        }
        return s.into();

        // Slow path: copies `s` into a fresh buffer. Every `\r` is dropped
        // from the output; a `\r` not followed by `\n` additionally reports
        // an error. `i` is the byte index of the first `\r` found by the
        // caller; `j` tracks the start of the next still-uncopied chunk.
        fn translate_crlf_(rdr: &StringReader,
                           start: BytePos,
                           s: &str,
                           errmsg: &str,
                           mut i: usize)
                           -> String {
            let mut buf = String::with_capacity(s.len());
            let mut j = 0;
            while i < s.len() {
                let ch = char_at(s, i);
                let next = i + ch.len_utf8();
                if ch == '\r' {
                    if j < i {
                        buf.push_str(&s[j..i]);
                    }
                    // Resume copying after the `\r`, so it never reaches `buf`.
                    j = next;
                    if next >= s.len() || char_at(s, next) != '\n' {
                        let pos = start + BytePos(i as u32);
                        let end_pos = start + BytePos(next as u32);
                        rdr.err_span_(pos, end_pos, errmsg);
                    }
                }
                i = next;
            }
            if j < s.len() {
                buf.push_str(&s[j..]);
            }
            buf
        }
    }
355
356
    /// Advance the StringReader by one character. If a newline is
    /// discovered, add it to the FileMap's list of line start offsets.
    pub fn bump(&mut self) {
        self.last_pos = self.pos;
        let current_byte_offset = self.byte_offset(self.pos).to_usize();
        if current_byte_offset < self.source_text.len() {
            assert!(self.curr.is_some());
            let last_char = self.curr.unwrap();
            let ch = char_at(&self.source_text, current_byte_offset);
            let next = current_byte_offset + ch.len_utf8();
            let byte_offset_diff = next - current_byte_offset;
            self.pos = self.pos + Pos::from_usize(byte_offset_diff);
            self.curr = Some(ch);
            self.col = self.col + CharPos(1);
            if last_char == '\n' {
                // The previous character ended a line, so the character just
                // read begins a new one: record its start and reset the column.
                self.filemap.next_line(self.last_pos);
                self.col = CharPos(0);
            }

            if byte_offset_diff > 1 {
                // Multi-byte UTF-8 character: the filemap tracks these so
                // byte positions can later be adjusted to character positions.
                self.filemap.record_multibyte_char(self.last_pos, byte_offset_diff);
            }
        } else {
            // Ran past the end of the source text: signal EOF.
            self.curr = None;
        }
    }
383
384 pub fn nextch(&self) -> Option<char> {
385 let offset = self.byte_offset(self.pos).to_usize();
386 if offset < self.source_text.len() {
387 Some(char_at(&self.source_text, offset))
388 } else {
389 None
390 }
391 }
392
393 pub fn nextch_is(&self, c: char) -> bool {
394 self.nextch() == Some(c)
395 }
396
397 pub fn nextnextch(&self) -> Option<char> {
398 let offset = self.byte_offset(self.pos).to_usize();
399 let s = &self.source_text[..];
400 if offset >= s.len() {
401 return None;
402 }
403 let next = offset + char_at(s, offset).len_utf8();
404 if next < s.len() {
405 Some(char_at(s, next))
406 } else {
407 None
408 }
409 }
410
411 pub fn nextnextch_is(&self, c: char) -> bool {
412 self.nextnextch() == Some(c)
413 }
414
415 /// Eats <XID_start><XID_continue>*, if possible.
416 fn scan_optional_raw_name(&mut self) -> Option<ast::Name> {
417 if !ident_start(self.curr) {
418 return None;
419 }
420 let start = self.last_pos;
421 while ident_continue(self.curr) {
422 self.bump();
423 }
424
425 self.with_str_from(start, |string| {
426 if string == "_" {
427 None
428 } else {
429 Some(token::intern(string))
430 }
431 })
432 }
433
    /// PRECONDITION: self.curr is not whitespace
    /// Eats any kind of comment.
    fn scan_comment(&mut self) -> Option<TokenAndSpan> {
        match self.curr {
            Some(c) => {
                if c.is_whitespace() {
                    // Caller broke the precondition; report it but carry on.
                    self.span_diagnostic.span_err(codemap::mk_sp(self.last_pos, self.last_pos),
                                                  "called consume_any_line_comment, but there \
                                                   was whitespace");
                }
            }
            None => {}
        }

        if self.curr_is('/') {
            match self.nextch() {
                Some('/') => {
                    self.bump();
                    self.bump();

                    // line comments starting with "///" or "//!" are doc-comments
                    let doc_comment = self.curr_is('/') || self.curr_is('!');
                    let start_bpos = if doc_comment {
                        // include all three leading characters in the span
                        self.pos - BytePos(3)
                    } else {
                        self.last_pos - BytePos(2)
                    };

                    // Consume up to (but not including) the line terminator.
                    while !self.is_eof() {
                        match self.curr.unwrap() {
                            '\n' => break,
                            '\r' => {
                                if self.nextch_is('\n') {
                                    // CRLF
                                    break;
                                } else if doc_comment {
                                    self.err_span_(self.last_pos,
                                                   self.pos,
                                                   "bare CR not allowed in doc-comment");
                                }
                            }
                            _ => (),
                        }
                        self.bump();
                    }

                    return if doc_comment {
                        self.with_str_from(start_bpos, |string| {
                            // comments with only more "/"s are not doc comments
                            let tok = if is_doc_comment(string) {
                                token::DocComment(token::intern(string))
                            } else {
                                token::Comment
                            };

                            Some(TokenAndSpan {
                                tok: tok,
                                sp: codemap::mk_sp(start_bpos, self.last_pos),
                            })
                        })
                    } else {
                        Some(TokenAndSpan {
                            tok: token::Comment,
                            sp: codemap::mk_sp(start_bpos, self.last_pos),
                        })
                    };
                }
                Some('*') => {
                    self.bump();
                    self.bump();
                    self.scan_block_comment()
                }
                _ => None,
            }
        } else if self.curr_is('#') {
            if self.nextch_is('!') {

                // Parse an inner attribute.
                if self.nextnextch_is('[') {
                    return None;
                }

                // I guess this is the only way to figure out if
                // we're at the beginning of the file...
                // NOTE(review): a throwaway CodeMap is built just to run
                // lookup_char_pos_adj on this filemap — acknowledged hack.
                let cmap = CodeMap::new();
                cmap.files.borrow_mut().push(self.filemap.clone());
                let loc = cmap.lookup_char_pos_adj(self.last_pos);
                debug!("Skipping a shebang");
                if loc.line == 1 && loc.col == CharPos(0) {
                    // FIXME: Add shebang "token", return it
                    let start = self.last_pos;
                    while !self.curr_is('\n') && !self.is_eof() {
                        self.bump();
                    }
                    return Some(TokenAndSpan {
                        tok: token::Shebang(self.name_from(start)),
                        sp: codemap::mk_sp(start, self.last_pos),
                    });
                }
            }
            None
        } else {
            None
        }
    }
539
540 /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
541 /// return None.
542 fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
543 match self.curr.unwrap_or('\0') {
544 // # to handle shebang at start of file -- this is the entry point
545 // for skipping over all "junk"
546 '/' | '#' => {
547 let c = self.scan_comment();
548 debug!("scanning a comment {:?}", c);
549 c
550 },
551 c if is_pattern_whitespace(Some(c)) => {
552 let start_bpos = self.last_pos;
553 while is_pattern_whitespace(self.curr) {
554 self.bump();
555 }
556 let c = Some(TokenAndSpan {
557 tok: token::Whitespace,
558 sp: codemap::mk_sp(start_bpos, self.last_pos),
559 });
560 debug!("scanning whitespace: {:?}", c);
561 c
562 }
563 _ => None,
564 }
565 }
566
    /// Might return a sugared-doc-attr
    fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
        // block comments starting with "/**" or "/*!" are doc-comments
        let is_doc_comment = self.curr_is('*') || self.curr_is('!');
        // the opening "/*" was already consumed by the caller
        let start_bpos = self.last_pos - BytePos(2);

        // `level` tracks nesting depth: block comments may nest.
        let mut level: isize = 1;
        let mut has_cr = false;
        while level > 0 {
            if self.is_eof() {
                let msg = if is_doc_comment {
                    "unterminated block doc-comment"
                } else {
                    "unterminated block comment"
                };
                let last_bpos = self.last_pos;
                panic!(self.fatal_span_(start_bpos, last_bpos, msg));
            }
            let n = self.curr.unwrap();
            match n {
                '/' if self.nextch_is('*') => {
                    // nested comment opens
                    level += 1;
                    self.bump();
                }
                '*' if self.nextch_is('/') => {
                    // a comment (possibly nested) closes
                    level -= 1;
                    self.bump();
                }
                '\r' => {
                    // remember so doc-comments can be CRLF-normalized below
                    has_cr = true;
                }
                _ => (),
            }
            self.bump();
        }

        self.with_str_from(start_bpos, |string| {
            // but comments with only "*"s between two "/"s are not
            let tok = if is_block_doc_comment(string) {
                let string = if has_cr {
                    self.translate_crlf(start_bpos,
                                        string,
                                        "bare CR not allowed in block doc-comment")
                } else {
                    string.into()
                };
                token::DocComment(token::intern(&string[..]))
            } else {
                token::Comment
            };

            Some(TokenAndSpan {
                tok: tok,
                sp: codemap::mk_sp(start_bpos, self.last_pos),
            })
        })
    }
624
625 /// Scan through any digits (base `scan_radix`) or underscores,
626 /// and return how many digits there were.
627 ///
628 /// `real_radix` represents the true radix of the number we're
629 /// interested in, and errors will be emitted for any digits
630 /// between `real_radix` and `scan_radix`.
631 fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
632 assert!(real_radix <= scan_radix);
633 let mut len = 0;
634 loop {
635 let c = self.curr;
636 if c == Some('_') {
637 debug!("skipping a _");
638 self.bump();
639 continue;
640 }
641 match c.and_then(|cc| cc.to_digit(scan_radix)) {
642 Some(_) => {
643 debug!("{:?} in scan_digits", c);
644 // check that the hypothetical digit is actually
645 // in range for the true radix
646 if c.unwrap().to_digit(real_radix).is_none() {
647 self.err_span_(self.last_pos,
648 self.pos,
649 &format!("invalid digit for a base {} literal", real_radix));
650 }
651 len += 1;
652 self.bump();
653 }
654 _ => return len,
655 }
656 }
657 }
658
    /// Lex a LIT_INTEGER or a LIT_FLOAT
    fn scan_number(&mut self, c: char) -> token::Lit {
        let num_digits;
        let mut base = 10;
        let start_bpos = self.last_pos;

        self.bump();

        if c == '0' {
            match self.curr.unwrap_or('\0') {
                'b' => {
                    self.bump();
                    base = 2;
                    // scan in base 10 so out-of-range digits (e.g. `0b12`)
                    // are reported by scan_digits instead of ending the token
                    num_digits = self.scan_digits(2, 10);
                }
                'o' => {
                    self.bump();
                    base = 8;
                    num_digits = self.scan_digits(8, 10);
                }
                'x' => {
                    self.bump();
                    base = 16;
                    num_digits = self.scan_digits(16, 16);
                }
                '0'...'9' | '_' | '.' => {
                    // the leading `0` itself counts as a digit, hence + 1
                    num_digits = self.scan_digits(10, 10) + 1;
                }
                _ => {
                    // just a 0
                    return token::Integer(self.name_from(start_bpos));
                }
            }
        } else if c.is_digit(10) {
            num_digits = self.scan_digits(10, 10) + 1;
        } else {
            num_digits = 0;
        }

        if num_digits == 0 {
            // e.g. `0x` with nothing after it; recover with a dummy `0`
            self.err_span_(start_bpos,
                           self.last_pos,
                           "no valid digits found for number");
            return token::Integer(token::intern("0"));
        }

        // might be a float, but don't be greedy if this is actually an
        // integer literal followed by field/method access or a range pattern
        // (`0..2` and `12.foo()`)
        if self.curr_is('.') && !self.nextch_is('.') &&
           !self.nextch()
                .unwrap_or('\0')
                .is_xid_start() {
            // might have stuff after the ., and if it does, it needs to start
            // with a number
            self.bump();
            if self.curr.unwrap_or('\0').is_digit(10) {
                self.scan_digits(10, 10);
                self.scan_float_exponent();
            }
            let last_pos = self.last_pos;
            self.check_float_base(start_bpos, last_pos, base);
            return token::Float(self.name_from(start_bpos));
        } else {
            // it might be a float if it has an exponent
            if self.curr_is('e') || self.curr_is('E') {
                self.scan_float_exponent();
                let last_pos = self.last_pos;
                self.check_float_base(start_bpos, last_pos, base);
                return token::Float(self.name_from(start_bpos));
            }
            // but we certainly have an integer!
            return token::Integer(self.name_from(start_bpos));
        }
    }
734
    /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an
    /// error if too many or too few digits are encountered.
    fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
        debug!("scanning {} digits until {:?}", n_digits, delim);
        let start_bpos = self.last_pos;
        // Running value of the escape, accumulated base-16.
        let mut accum_int = 0;

        let mut valid = true;
        for _ in 0..n_digits {
            if self.is_eof() {
                let last_bpos = self.last_pos;
                panic!(self.fatal_span_(start_bpos,
                                        last_bpos,
                                        "unterminated numeric character escape"));
            }
            if self.curr_is(delim) {
                // Closing delimiter showed up before all digits were read.
                let last_bpos = self.last_pos;
                self.err_span_(start_bpos,
                               last_bpos,
                               "numeric character escape is too short");
                valid = false;
                break;
            }
            let c = self.curr.unwrap_or('\x00');
            accum_int *= 16;
            accum_int += c.to_digit(16).unwrap_or_else(|| {
                // Non-hex character: report it and treat the digit as 0 so
                // scanning can continue.
                self.err_span_char(self.last_pos,
                                   self.pos,
                                   "invalid character in numeric character escape",
                                   c);

                valid = false;
                0
            });
            self.bump();
        }

        if below_0x7f_only && accum_int >= 0x80 {
            // Callers that pass `below_0x7f_only` restrict this escape form
            // to the ASCII range.
            self.err_span_(start_bpos,
                           self.last_pos,
                           "this form of character escape may only be used with characters in \
                            the range [\\x00-\\x7f]");
            valid = false;
        }

        match char::from_u32(accum_int) {
            Some(_) => valid,
            None => {
                let last_bpos = self.last_pos;
                self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
                false
            }
        }
    }
789
    /// Scan for a single (possibly escaped) byte or char
    /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
    /// `start` is the position of `first_source_char`, which is already consumed.
    ///
    /// Returns true if there was a valid char/byte, false otherwise.
    fn scan_char_or_byte(&mut self,
                         start: BytePos,
                         first_source_char: char,
                         ascii_only: bool,
                         delim: char)
                         -> bool {
        match first_source_char {
            '\\' => {
                // '\X' for some X must be a character constant:
                let escaped = self.curr;
                let escaped_pos = self.last_pos;
                self.bump();
                match escaped {
                    None => {} // EOF here is an error that will be checked later.
                    Some(e) => {
                        return match e {
                            // simple single-character escapes
                            'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
                            'x' => self.scan_byte_escape(delim, !ascii_only),
                            'u' => {
                                let valid = if self.curr_is('{') {
                                    self.scan_unicode_escape(delim) && !ascii_only
                                } else {
                                    // `\u` not followed by `{` is malformed
                                    let span = codemap::mk_sp(start, self.last_pos);
                                    self.span_diagnostic
                                        .struct_span_err(span, "incorrect unicode escape sequence")
                                        .span_help(span,
                                                   "format of unicode escape sequences is \
                                                    `\\u{…}`")
                                        .emit();
                                    false
                                };
                                if ascii_only {
                                    self.err_span_(start,
                                                   self.last_pos,
                                                   "unicode escape sequences cannot be used as a \
                                                    byte or in a byte string");
                                }
                                valid

                            }
                            // backslash-newline: line continuation in strings
                            '\n' if delim == '"' => {
                                self.consume_whitespace();
                                true
                            }
                            '\r' if delim == '"' && self.curr_is('\n') => {
                                self.consume_whitespace();
                                true
                            }
                            c => {
                                // any other escaped character is an error;
                                // add targeted help for common mistakes
                                let last_pos = self.last_pos;
                                let mut err = self.struct_err_span_char(escaped_pos,
                                                                        last_pos,
                                                                        if ascii_only {
                                                                            "unknown byte escape"
                                                                        } else {
                                                                            "unknown character \
                                                                             escape"
                                                                        },
                                                                        c);
                                if e == '\r' {
                                    err.span_help(codemap::mk_sp(escaped_pos, last_pos),
                                                  "this is an isolated carriage return; consider \
                                                   checking your editor and version control \
                                                   settings");
                                }
                                if (e == '{' || e == '}') && !ascii_only {
                                    err.span_help(codemap::mk_sp(escaped_pos, last_pos),
                                                  "if used in a formatting string, curly braces \
                                                   are escaped with `{{` and `}}`");
                                }
                                err.emit();
                                false
                            }
                        }
                    }
                }
            }
            // these characters may not appear unescaped in a char literal
            '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
                let last_pos = self.last_pos;
                self.err_span_char(start,
                                   last_pos,
                                   if ascii_only {
                                       "byte constant must be escaped"
                                   } else {
                                       "character constant must be escaped"
                                   },
                                   first_source_char);
                return false;
            }
            '\r' => {
                if self.curr_is('\n') {
                    // CRLF: consume the LF half and accept
                    self.bump();
                    return true;
                } else {
                    self.err_span_(start,
                                   self.last_pos,
                                   "bare CR not allowed in string, use \\r instead");
                    return false;
                }
            }
            _ => {
                if ascii_only && first_source_char > '\x7F' {
                    let last_pos = self.last_pos;
                    self.err_span_char(start,
                                       last_pos,
                                       "byte constant must be ASCII. Use a \\xHH escape for a \
                                        non-ASCII byte",
                                       first_source_char);
                    return false;
                }
            }
        }
        true
    }
909
    /// Scan over a \u{...} escape
    ///
    /// At this point, we have already seen the \ and the u, the { is the current character. We
    /// will read at least one digit, and up to 6, and pass over the }.
    fn scan_unicode_escape(&mut self, delim: char) -> bool {
        self.bump(); // past the {
        let start_bpos = self.last_pos;
        // number of digits consumed so far (capped checks below at 6)
        let mut count = 0;
        // the code point value, accumulated base-16
        let mut accum_int = 0;
        let mut valid = true;

        while !self.curr_is('}') && count <= 6 {
            let c = match self.curr {
                Some(c) => c,
                None => {
                    panic!(self.fatal_span_(start_bpos,
                                            self.last_pos,
                                            "unterminated unicode escape (found EOF)"));
                }
            };
            accum_int *= 16;
            accum_int += c.to_digit(16).unwrap_or_else(|| {
                if c == delim {
                    // hit the literal's own delimiter before the closing `}`
                    panic!(self.fatal_span_(self.last_pos,
                                            self.pos,
                                            "unterminated unicode escape (needed a `}`)"));
                } else {
                    self.err_span_char(self.last_pos,
                                       self.pos,
                                       "invalid character in unicode escape",
                                       c);
                }
                valid = false;
                0
            });
            self.bump();
            count += 1;
        }

        if count > 6 {
            self.err_span_(start_bpos,
                           self.last_pos,
                           "overlong unicode escape (can have at most 6 hex digits)");
            valid = false;
        }

        // `count == 0` means `\u{}`; from_u32 rejects surrogates/overflow
        if valid && (char::from_u32(accum_int).is_none() || count == 0) {
            self.err_span_(start_bpos,
                           self.last_pos,
                           "invalid unicode character escape");
            valid = false;
        }

        self.bump(); // past the ending }
        valid
    }
966
967 /// Scan over a float exponent.
968 fn scan_float_exponent(&mut self) {
969 if self.curr_is('e') || self.curr_is('E') {
970 self.bump();
971 if self.curr_is('-') || self.curr_is('+') {
972 self.bump();
973 }
974 if self.scan_digits(10, 10) == 0 {
975 self.err_span_(self.last_pos,
976 self.pos,
977 "expected at least one digit in exponent")
978 }
979 }
980 }
981
982 /// Check that a base is valid for a floating literal, emitting a nice
983 /// error if it isn't.
984 fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) {
985 match base {
986 16 => {
987 self.err_span_(start_bpos,
988 last_bpos,
989 "hexadecimal float literal is not supported")
990 }
991 8 => {
992 self.err_span_(start_bpos,
993 last_bpos,
994 "octal float literal is not supported")
995 }
996 2 => {
997 self.err_span_(start_bpos,
998 last_bpos,
999 "binary float literal is not supported")
1000 }
1001 _ => (),
1002 }
1003 }
1004
1005 fn binop(&mut self, op: token::BinOpToken) -> token::Token {
1006 self.bump();
1007 if self.curr_is('=') {
1008 self.bump();
1009 return token::BinOpEq(op);
1010 } else {
1011 return token::BinOp(op);
1012 }
1013 }
1014
1015 /// Return the next token from the string, advances the input past that
1016 /// token, and updates the interner
1017 fn next_token_inner(&mut self) -> token::Token {
1018 let c = self.curr;
1019 if ident_start(c) &&
1020 match (c.unwrap(), self.nextch(), self.nextnextch()) {
1021 // Note: r as in r" or r#" is part of a raw string literal,
1022 // b as in b' is part of a byte literal.
1023 // They are not identifiers, and are handled further down.
1024 ('r', Some('"'), _) |
1025 ('r', Some('#'), _) |
1026 ('b', Some('"'), _) |
1027 ('b', Some('\''), _) |
1028 ('b', Some('r'), Some('"')) |
1029 ('b', Some('r'), Some('#')) => false,
1030 _ => true,
1031 } {
1032 let start = self.last_pos;
1033 while ident_continue(self.curr) {
1034 self.bump();
1035 }
1036
1037 return self.with_str_from(start, |string| {
1038 if string == "_" {
1039 token::Underscore
1040 } else {
1041 // FIXME: perform NFKC normalization here. (Issue #2253)
1042 if self.curr_is(':') && self.nextch_is(':') {
1043 token::Ident(str_to_ident(string), token::ModName)
1044 } else {
1045 token::Ident(str_to_ident(string), token::Plain)
1046 }
1047 }
1048 });
1049 }
1050
1051 if is_dec_digit(c) {
1052 let num = self.scan_number(c.unwrap());
1053 let suffix = self.scan_optional_raw_name();
1054 debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix);
1055 return token::Literal(num, suffix);
1056 }
1057
1058 match c.expect("next_token_inner called at EOF") {
1059 // One-byte tokens.
1060 ';' => {
1061 self.bump();
1062 return token::Semi;
1063 }
1064 ',' => {
1065 self.bump();
1066 return token::Comma;
1067 }
1068 '.' => {
1069 self.bump();
1070 return if self.curr_is('.') {
1071 self.bump();
1072 if self.curr_is('.') {
1073 self.bump();
1074 token::DotDotDot
1075 } else {
1076 token::DotDot
1077 }
1078 } else {
1079 token::Dot
1080 };
1081 }
1082 '(' => {
1083 self.bump();
1084 return token::OpenDelim(token::Paren);
1085 }
1086 ')' => {
1087 self.bump();
1088 return token::CloseDelim(token::Paren);
1089 }
1090 '{' => {
1091 self.bump();
1092 return token::OpenDelim(token::Brace);
1093 }
1094 '}' => {
1095 self.bump();
1096 return token::CloseDelim(token::Brace);
1097 }
1098 '[' => {
1099 self.bump();
1100 return token::OpenDelim(token::Bracket);
1101 }
1102 ']' => {
1103 self.bump();
1104 return token::CloseDelim(token::Bracket);
1105 }
1106 '@' => {
1107 self.bump();
1108 return token::At;
1109 }
1110 '#' => {
1111 self.bump();
1112 return token::Pound;
1113 }
1114 '~' => {
1115 self.bump();
1116 return token::Tilde;
1117 }
1118 '?' => {
1119 self.bump();
1120 return token::Question;
1121 }
1122 ':' => {
1123 self.bump();
1124 if self.curr_is(':') {
1125 self.bump();
1126 return token::ModSep;
1127 } else {
1128 return token::Colon;
1129 }
1130 }
1131
1132 '$' => {
1133 self.bump();
1134 return token::Dollar;
1135 }
1136
1137 // Multi-byte tokens.
1138 '=' => {
1139 self.bump();
1140 if self.curr_is('=') {
1141 self.bump();
1142 return token::EqEq;
1143 } else if self.curr_is('>') {
1144 self.bump();
1145 return token::FatArrow;
1146 } else {
1147 return token::Eq;
1148 }
1149 }
1150 '!' => {
1151 self.bump();
1152 if self.curr_is('=') {
1153 self.bump();
1154 return token::Ne;
1155 } else {
1156 return token::Not;
1157 }
1158 }
1159 '<' => {
1160 self.bump();
1161 match self.curr.unwrap_or('\x00') {
1162 '=' => {
1163 self.bump();
1164 return token::Le;
1165 }
1166 '<' => {
1167 return self.binop(token::Shl);
1168 }
1169 '-' => {
1170 self.bump();
1171 match self.curr.unwrap_or('\x00') {
1172 _ => {
1173 return token::LArrow;
1174 }
1175 }
1176 }
1177 _ => {
1178 return token::Lt;
1179 }
1180 }
1181 }
1182 '>' => {
1183 self.bump();
1184 match self.curr.unwrap_or('\x00') {
1185 '=' => {
1186 self.bump();
1187 return token::Ge;
1188 }
1189 '>' => {
1190 return self.binop(token::Shr);
1191 }
1192 _ => {
1193 return token::Gt;
1194 }
1195 }
1196 }
1197 '\'' => {
1198 // Either a character constant 'a' OR a lifetime name 'abc
1199 let start_with_quote = self.last_pos;
1200 self.bump();
1201 let start = self.last_pos;
1202
1203 // the eof will be picked up by the final `'` check below
1204 let c2 = self.curr.unwrap_or('\x00');
1205 self.bump();
1206
1207 // If the character is an ident start not followed by another single
1208 // quote, then this is a lifetime name:
1209 if ident_start(Some(c2)) && !self.curr_is('\'') {
1210 while ident_continue(self.curr) {
1211 self.bump();
1212 }
1213 // lifetimes shouldn't end with a single quote
1214 // if we find one, then this is an invalid character literal
1215 if self.curr_is('\'') {
1216 panic!(self.fatal_span_verbose(
1217 start_with_quote, self.pos,
1218 String::from("character literal may only contain one codepoint")));
1219
1220 }
1221
1222 // Include the leading `'` in the real identifier, for macro
1223 // expansion purposes. See #12512 for the gory details of why
1224 // this is necessary.
1225 let ident = self.with_str_from(start, |lifetime_name| {
1226 str_to_ident(&format!("'{}", lifetime_name))
1227 });
1228
1229 // Conjure up a "keyword checking ident" to make sure that
1230 // the lifetime name is not a keyword.
1231 let keyword_checking_ident = self.with_str_from(start, |lifetime_name| {
1232 str_to_ident(lifetime_name)
1233 });
1234 let keyword_checking_token = &token::Ident(keyword_checking_ident,
1235 token::Plain);
1236 let last_bpos = self.last_pos;
1237 if keyword_checking_token.is_keyword(token::keywords::SelfValue) {
1238 self.err_span_(start,
1239 last_bpos,
1240 "invalid lifetime name: 'self is no longer a special \
1241 lifetime");
1242 } else if keyword_checking_token.is_any_keyword() &&
1243 !keyword_checking_token.is_keyword(token::keywords::Static) {
1244 self.err_span_(start, last_bpos, "invalid lifetime name");
1245 }
1246
1247 return token::Lifetime(ident);
1248 }
1249
1250 let valid = self.scan_char_or_byte(start,
1251 c2,
1252 // ascii_only =
1253 false,
1254 '\'');
1255
1256 if !self.curr_is('\'') {
1257 panic!(self.fatal_span_verbose(
1258 start_with_quote, self.last_pos,
1259 String::from("character literal may only contain one codepoint")));
1260 }
1261
1262 let id = if valid {
1263 self.name_from(start)
1264 } else {
1265 token::intern("0")
1266 };
1267 self.bump(); // advance curr past token
1268 let suffix = self.scan_optional_raw_name();
1269 return token::Literal(token::Char(id), suffix);
1270 }
1271 'b' => {
1272 self.bump();
1273 let lit = match self.curr {
1274 Some('\'') => self.scan_byte(),
1275 Some('"') => self.scan_byte_string(),
1276 Some('r') => self.scan_raw_byte_string(),
1277 _ => unreachable!(), // Should have been a token::Ident above.
1278 };
1279 let suffix = self.scan_optional_raw_name();
1280 return token::Literal(lit, suffix);
1281 }
1282 '"' => {
1283 let start_bpos = self.last_pos;
1284 let mut valid = true;
1285 self.bump();
1286 while !self.curr_is('"') {
1287 if self.is_eof() {
1288 let last_bpos = self.last_pos;
1289 panic!(self.fatal_span_(start_bpos,
1290 last_bpos,
1291 "unterminated double quote string"));
1292 }
1293
1294 let ch_start = self.last_pos;
1295 let ch = self.curr.unwrap();
1296 self.bump();
1297 valid &= self.scan_char_or_byte(ch_start,
1298 ch,
1299 // ascii_only =
1300 false,
1301 '"');
1302 }
1303 // adjust for the ASCII " at the start of the literal
1304 let id = if valid {
1305 self.name_from(start_bpos + BytePos(1))
1306 } else {
1307 token::intern("??")
1308 };
1309 self.bump();
1310 let suffix = self.scan_optional_raw_name();
1311 return token::Literal(token::Str_(id), suffix);
1312 }
1313 'r' => {
1314 let start_bpos = self.last_pos;
1315 self.bump();
1316 let mut hash_count = 0;
1317 while self.curr_is('#') {
1318 self.bump();
1319 hash_count += 1;
1320 }
1321
1322 if self.is_eof() {
1323 let last_bpos = self.last_pos;
1324 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1325 } else if !self.curr_is('"') {
1326 let last_bpos = self.last_pos;
1327 let curr_char = self.curr.unwrap();
1328 panic!(self.fatal_span_char(start_bpos,
1329 last_bpos,
1330 "found invalid character; only `#` is allowed \
1331 in raw string delimitation",
1332 curr_char));
1333 }
1334 self.bump();
1335 let content_start_bpos = self.last_pos;
1336 let mut content_end_bpos;
1337 let mut valid = true;
1338 'outer: loop {
1339 if self.is_eof() {
1340 let last_bpos = self.last_pos;
1341 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1342 }
1343 // if self.curr_is('"') {
1344 // content_end_bpos = self.last_pos;
1345 // for _ in 0..hash_count {
1346 // self.bump();
1347 // if !self.curr_is('#') {
1348 // continue 'outer;
1349 let c = self.curr.unwrap();
1350 match c {
1351 '"' => {
1352 content_end_bpos = self.last_pos;
1353 for _ in 0..hash_count {
1354 self.bump();
1355 if !self.curr_is('#') {
1356 continue 'outer;
1357 }
1358 }
1359 break;
1360 }
1361 '\r' => {
1362 if !self.nextch_is('\n') {
1363 let last_bpos = self.last_pos;
1364 self.err_span_(start_bpos,
1365 last_bpos,
1366 "bare CR not allowed in raw string, use \\r \
1367 instead");
1368 valid = false;
1369 }
1370 }
1371 _ => (),
1372 }
1373 self.bump();
1374 }
1375 self.bump();
1376 let id = if valid {
1377 self.name_from_to(content_start_bpos, content_end_bpos)
1378 } else {
1379 token::intern("??")
1380 };
1381 let suffix = self.scan_optional_raw_name();
1382 return token::Literal(token::StrRaw(id, hash_count), suffix);
1383 }
1384 '-' => {
1385 if self.nextch_is('>') {
1386 self.bump();
1387 self.bump();
1388 return token::RArrow;
1389 } else {
1390 return self.binop(token::Minus);
1391 }
1392 }
1393 '&' => {
1394 if self.nextch_is('&') {
1395 self.bump();
1396 self.bump();
1397 return token::AndAnd;
1398 } else {
1399 return self.binop(token::And);
1400 }
1401 }
1402 '|' => {
1403 match self.nextch() {
1404 Some('|') => {
1405 self.bump();
1406 self.bump();
1407 return token::OrOr;
1408 }
1409 _ => {
1410 return self.binop(token::Or);
1411 }
1412 }
1413 }
1414 '+' => {
1415 return self.binop(token::Plus);
1416 }
1417 '*' => {
1418 return self.binop(token::Star);
1419 }
1420 '/' => {
1421 return self.binop(token::Slash);
1422 }
1423 '^' => {
1424 return self.binop(token::Caret);
1425 }
1426 '%' => {
1427 return self.binop(token::Percent);
1428 }
1429 c => {
1430 let last_bpos = self.last_pos;
1431 let bpos = self.pos;
1432 let mut err = self.struct_fatal_span_char(last_bpos,
1433 bpos,
1434 "unknown start of token",
1435 c);
1436 unicode_chars::check_for_substitution(&self, c, &mut err);
1437 err.emit();
1438 panic!(FatalError);
1439 }
1440 }
1441 }
1442
1443 fn consume_whitespace(&mut self) {
1444 while is_pattern_whitespace(self.curr) && !self.is_eof() {
1445 self.bump();
1446 }
1447 }
1448
1449 fn read_to_eol(&mut self) -> String {
1450 let mut val = String::new();
1451 while !self.curr_is('\n') && !self.is_eof() {
1452 val.push(self.curr.unwrap());
1453 self.bump();
1454 }
1455 if self.curr_is('\n') {
1456 self.bump();
1457 }
1458 return val;
1459 }
1460
1461 fn read_one_line_comment(&mut self) -> String {
1462 let val = self.read_to_eol();
1463 assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') ||
1464 (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!'));
1465 return val;
1466 }
1467
1468 fn consume_non_eol_whitespace(&mut self) {
1469 while is_pattern_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
1470 self.bump();
1471 }
1472 }
1473
1474 fn peeking_at_comment(&self) -> bool {
1475 (self.curr_is('/') && self.nextch_is('/')) || (self.curr_is('/') && self.nextch_is('*')) ||
1476 // consider shebangs comments, but not inner attributes
1477 (self.curr_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
1478 }
1479
1480 fn scan_byte(&mut self) -> token::Lit {
1481 self.bump();
1482 let start = self.last_pos;
1483
1484 // the eof will be picked up by the final `'` check below
1485 let c2 = self.curr.unwrap_or('\x00');
1486 self.bump();
1487
1488 let valid = self.scan_char_or_byte(start,
1489 c2,
1490 // ascii_only =
1491 true,
1492 '\'');
1493 if !self.curr_is('\'') {
1494 // Byte offsetting here is okay because the
1495 // character before position `start` are an
1496 // ascii single quote and ascii 'b'.
1497 let last_pos = self.last_pos;
1498 panic!(self.fatal_span_verbose(start - BytePos(2),
1499 last_pos,
1500 "unterminated byte constant".to_string()));
1501 }
1502
1503 let id = if valid {
1504 self.name_from(start)
1505 } else {
1506 token::intern("?")
1507 };
1508 self.bump(); // advance curr past token
1509 return token::Byte(id);
1510 }
1511
1512 fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
1513 self.scan_hex_digits(2, delim, below_0x7f_only)
1514 }
1515
1516 fn scan_byte_string(&mut self) -> token::Lit {
1517 self.bump();
1518 let start = self.last_pos;
1519 let mut valid = true;
1520
1521 while !self.curr_is('"') {
1522 if self.is_eof() {
1523 let last_pos = self.last_pos;
1524 panic!(self.fatal_span_(start, last_pos, "unterminated double quote byte string"));
1525 }
1526
1527 let ch_start = self.last_pos;
1528 let ch = self.curr.unwrap();
1529 self.bump();
1530 valid &= self.scan_char_or_byte(ch_start,
1531 ch,
1532 // ascii_only =
1533 true,
1534 '"');
1535 }
1536 let id = if valid {
1537 self.name_from(start)
1538 } else {
1539 token::intern("??")
1540 };
1541 self.bump();
1542 return token::ByteStr(id);
1543 }
1544
    /// Scan a raw byte string `br##"..."##`. On entry `curr` is the first
    /// `#` (or the opening quote when there are no hashes); the `br` was
    /// consumed by the caller. Returns a `ByteStrRaw` token carrying the
    /// interned content and the hash count.
    fn scan_raw_byte_string(&mut self) -> token::Lit {
        let start_bpos = self.last_pos;
        self.bump();
        // Count the `#`s of the opening delimiter; the closing delimiter
        // must match this count exactly.
        let mut hash_count = 0;
        while self.curr_is('#') {
            self.bump();
            hash_count += 1;
        }

        if self.is_eof() {
            let last_pos = self.last_pos;
            panic!(self.fatal_span_(start_bpos, last_pos, "unterminated raw string"));
        } else if !self.curr_is('"') {
            // After the hashes, only the opening quote is legal.
            let last_pos = self.last_pos;
            let ch = self.curr.unwrap();
            panic!(self.fatal_span_char(start_bpos,
                                        last_pos,
                                        "found invalid character; only `#` is allowed in raw \
                                         string delimitation",
                                        ch));
        }
        self.bump();
        let content_start_bpos = self.last_pos;
        let mut content_end_bpos;
        'outer: loop {
            match self.curr {
                None => {
                    let last_pos = self.last_pos;
                    panic!(self.fatal_span_(start_bpos, last_pos, "unterminated raw string"))
                }
                Some('"') => {
                    // Candidate terminator: it only counts if followed by
                    // exactly `hash_count` `#`s. If any is missing we
                    // re-enter the loop and keep scanning content.
                    content_end_bpos = self.last_pos;
                    for _ in 0..hash_count {
                        self.bump();
                        if !self.curr_is('#') {
                            continue 'outer;
                        }
                    }
                    break;
                }
                Some(c) => {
                    // Byte strings may only contain ASCII; report but keep
                    // scanning so later errors are still surfaced.
                    if c > '\x7F' {
                        let last_pos = self.last_pos;
                        self.err_span_char(last_pos, last_pos, "raw byte string must be ASCII", c);
                    }
                }
            }
            self.bump();
        }
        // step past the final `#` (or the quote, when hash_count == 0)
        self.bump();
        return token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos),
                                 hash_count);
    }
1598 }
1599
1600 // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1601 // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1602 pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1603 c.map_or(false, Pattern_White_Space)
1604 }
1605
/// Is `c` within the inclusive character range `lo..=hi`?
/// `None` (EOF) is never in range.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}
1612
/// Is `c` an ASCII decimal digit (`'0'..='9'`)? `None` (EOF) is not.
fn is_dec_digit(c: Option<char>) -> bool {
    match c {
        Some(ch) => '0' <= ch && ch <= '9',
        None => false,
    }
}
1616
1617 pub fn is_doc_comment(s: &str) -> bool {
1618 let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
1619 s.starts_with("//!");
1620 debug!("is {:?} a doc comment? {}", s, res);
1621 res
1622 }
1623
1624 pub fn is_block_doc_comment(s: &str) -> bool {
1625 // Prevent `/**/` from being parsed as a doc comment
1626 let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
1627 s.starts_with("/*!")) && s.len() >= 5;
1628 debug!("is {:?} a doc comment? {}", s, res);
1629 res
1630 }
1631
1632 fn ident_start(c: Option<char>) -> bool {
1633 let c = match c {
1634 Some(c) => c,
1635 None => return false,
1636 };
1637
1638 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start())
1639 }
1640
1641 fn ident_continue(c: Option<char>) -> bool {
1642 let c = match c {
1643 Some(c) => c,
1644 None => return false,
1645 };
1646
1647 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' ||
1648 (c > '\x7f' && c.is_xid_continue())
1649 }
1650
#[cfg(test)]
mod tests {
    use super::*;

    use codemap::{BytePos, CodeMap, Span, NO_EXPANSION};
    use errors;
    use parse::token;
    use parse::token::str_to_ident;
    use std::io;
    use std::rc::Rc;

    // Build a diagnostic handler whose output is discarded, so tests can
    // exercise the lexer (including error paths) without printing.
    fn mk_sh(cm: Rc<CodeMap>) -> errors::Handler {
        // FIXME (#22405): Replace `Box::new` with `box` here when/if possible.
        let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), None, cm);
        errors::Handler::with_emitter(true, false, Box::new(emitter))
    }

    // open a string reader for the given string
    fn setup<'a>(cm: &CodeMap,
                 span_handler: &'a errors::Handler,
                 teststr: String)
                 -> StringReader<'a> {
        let fm = cm.new_filemap("zebra.rs".to_string(), teststr);
        StringReader::new(span_handler, fm)
    }

    #[test]
    fn t1() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut string_reader = setup(&cm,
                                      &sh,
                                      "/* my source file */ fn main() { println!(\"zebra\"); }\n"
                                          .to_string());
        let id = str_to_ident("fn");
        assert_eq!(string_reader.next_token().tok, token::Comment);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);
        let tok1 = string_reader.next_token();
        // `fn` occupies bytes 21..23 of the test input above.
        let tok2 = TokenAndSpan {
            tok: token::Ident(id, token::Plain),
            sp: Span {
                lo: BytePos(21),
                hi: BytePos(23),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(tok1, tok2);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);
        // the 'main' id is already read:
        assert_eq!(string_reader.last_pos.clone(), BytePos(28));
        // read another token:
        let tok3 = string_reader.next_token();
        let tok4 = TokenAndSpan {
            tok: token::Ident(str_to_ident("main"), token::Plain),
            sp: Span {
                lo: BytePos(24),
                hi: BytePos(28),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(tok3, tok4);
        // the lparen is already read:
        assert_eq!(string_reader.last_pos.clone(), BytePos(29))
    }

    // check that the given reader produces the desired stream
    // of tokens (stop checking after exhausting the expected vec)
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for expected_tok in &expected {
            assert_eq!(&string_reader.next_token().tok, expected_tok);
        }
    }

    // make the identifier by looking up the string in the interner
    fn mk_ident(id: &str, style: token::IdentStyle) -> token::Token {
        token::Ident(str_to_ident(id), style)
    }

    #[test]
    fn doublecolonparsing() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        check_tokenization(setup(&cm, &sh, "a b".to_string()),
                           vec![mk_ident("a", token::Plain),
                                token::Whitespace,
                                mk_ident("b", token::Plain)]);
    }

    #[test]
    fn dcparsing_2() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        // `a` directly followed by `::` is lexed as a module-path
        // segment (ModName) rather than a plain ident.
        check_tokenization(setup(&cm, &sh, "a::b".to_string()),
                           vec![mk_ident("a", token::ModName),
                                token::ModSep,
                                mk_ident("b", token::Plain)]);
    }

    #[test]
    fn dcparsing_3() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        // whitespace before `::` keeps `a` a plain ident
        check_tokenization(setup(&cm, &sh, "a ::b".to_string()),
                           vec![mk_ident("a", token::Plain),
                                token::Whitespace,
                                token::ModSep,
                                mk_ident("b", token::Plain)]);
    }

    #[test]
    fn dcparsing_4() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        // whitespace after `::` does not affect the ModName styling of `a`
        check_tokenization(setup(&cm, &sh, "a:: b".to_string()),
                           vec![mk_ident("a", token::ModName),
                                token::ModSep,
                                token::Whitespace,
                                mk_ident("b", token::Plain)]);
    }

    #[test]
    fn character_a() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        assert_eq!(setup(&cm, &sh, "'a'".to_string()).next_token().tok,
                   token::Literal(token::Char(token::intern("a")), None));
    }

    #[test]
    fn character_space() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        assert_eq!(setup(&cm, &sh, "' '".to_string()).next_token().tok,
                   token::Literal(token::Char(token::intern(" ")), None));
    }

    #[test]
    fn character_escaped() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        // escape sequences are interned verbatim, not decoded
        assert_eq!(setup(&cm, &sh, "'\\n'".to_string()).next_token().tok,
                   token::Literal(token::Char(token::intern("\\n")), None));
    }

    #[test]
    fn lifetime_name() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        // the leading quote is kept as part of the lifetime's ident
        assert_eq!(setup(&cm, &sh, "'abc".to_string()).next_token().tok,
                   token::Lifetime(token::str_to_ident("'abc")));
    }

    #[test]
    fn raw_string() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        // content may contain quotes, fewer hashes, backslashes, and NULs
        assert_eq!(setup(&cm, &sh, "r###\"\"#a\\b\x00c\"\"###".to_string())
                       .next_token()
                       .tok,
                   token::Literal(token::StrRaw(token::intern("\"#a\\b\x00c\""), 3), None));
    }

    #[test]
    fn literal_suffixes() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        // A suffix directly attached to a literal is captured with it; a
        // whitespace-separated suffix is a separate token (suffix = None).
        macro_rules! test {
            ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
                assert_eq!(setup(&cm, &sh, format!("{}suffix", $input)).next_token().tok,
                           token::Literal(token::$tok_type(token::intern($tok_contents)),
                                          Some(token::intern("suffix"))));
                // with a whitespace separator:
                assert_eq!(setup(&cm, &sh, format!("{} suffix", $input)).next_token().tok,
                           token::Literal(token::$tok_type(token::intern($tok_contents)),
                                          None));
            }}
        }

        test!("'a'", Char, "a");
        test!("b'a'", Byte, "a");
        test!("\"a\"", Str_, "a");
        test!("b\"a\"", ByteStr, "a");
        test!("1234", Integer, "1234");
        test!("0b101", Integer, "0b101");
        test!("0xABC", Integer, "0xABC");
        test!("1.0", Float, "1.0");
        test!("1.0e10", Float, "1.0e10");

        assert_eq!(setup(&cm, &sh, "2us".to_string()).next_token().tok,
                   token::Literal(token::Integer(token::intern("2")),
                                  Some(token::intern("us"))));
        assert_eq!(setup(&cm, &sh, "r###\"raw\"###suffix".to_string()).next_token().tok,
                   token::Literal(token::StrRaw(token::intern("raw"), 3),
                                  Some(token::intern("suffix"))));
        assert_eq!(setup(&cm, &sh, "br###\"raw\"###suffix".to_string()).next_token().tok,
                   token::Literal(token::ByteStrRaw(token::intern("raw"), 3),
                                  Some(token::intern("suffix"))));
    }

    #[test]
    fn line_doc_comments() {
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        assert!(!is_doc_comment("////"));
    }

    #[test]
    fn nested_block_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
        // a nested block comment comes back as a single Comment token
        match lexer.next_token().tok {
            token::Comment => {}
            _ => panic!("expected a comment!"),
        }
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(token::intern("a")), None));
    }

    #[test]
    fn crlf_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());
        let comment = lexer.next_token();
        assert_eq!(comment.tok, token::Comment);
        // the span ends before the `\r`: CRLF line endings are normalized
        assert_eq!(comment.sp, ::codemap::mk_sp(BytePos(0), BytePos(7)));
        assert_eq!(lexer.next_token().tok, token::Whitespace);
        assert_eq!(lexer.next_token().tok,
                   token::DocComment(token::intern("/// test")));
    }
}