]> git.proxmox.com Git - rustc.git/blob - src/libsyntax/parse/lexer/mod.rs
New upstream version 1.17.0+dfsg1
[rustc.git] / src / libsyntax / parse / lexer / mod.rs
1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use ast::{self, Ident};
12 use syntax_pos::{self, BytePos, CharPos, Pos, Span};
13 use codemap::CodeMap;
14 use errors::{FatalError, DiagnosticBuilder};
15 use parse::{token, ParseSess};
16 use str::char_at;
17 use symbol::{Symbol, keywords};
18 use std_unicode::property::Pattern_White_Space;
19
20 use std::borrow::Cow;
21 use std::char;
22 use std::mem::replace;
23 use std::rc::Rc;
24
25 pub mod comments;
26 mod tokentrees;
27 mod unicode_chars;
28
/// A single lexed token paired with the source span it was read from.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct TokenAndSpan {
    pub tok: token::Token,
    pub sp: Span,
}
34
35 impl Default for TokenAndSpan {
36 fn default() -> Self {
37 TokenAndSpan { tok: token::Underscore, sp: syntax_pos::DUMMY_SP }
38 }
39 }
40
pub struct StringReader<'a> {
    /// Parse session, used for diagnostics and interning.
    pub sess: &'a ParseSess,
    /// The absolute offset within the codemap of the next character to read
    pub next_pos: BytePos,
    /// The absolute offset within the codemap of the current character
    pub pos: BytePos,
    /// The column of the next character to read
    pub col: CharPos,
    /// The current character (which has been read from self.pos)
    pub ch: Option<char>,
    /// The filemap being lexed.
    pub filemap: Rc<syntax_pos::FileMap>,
    /// If Some, stop reading the source at this position (inclusive).
    pub terminator: Option<BytePos>,
    /// Whether to record new-lines and multibyte chars in filemap.
    /// This is only necessary the first time a filemap is lexed.
    /// If part of a filemap is being re-lexed, this should be set to false.
    pub save_new_lines_and_multibyte: bool,
    // cached: the token and span that will be returned next.
    pub peek_tok: token::Token,
    pub peek_span: Span,
    /// Fatal diagnostics queued up to be emitted before aborting.
    pub fatal_errs: Vec<DiagnosticBuilder<'a>>,
    // cache a direct reference to the source text, so that we don't have to
    // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time.
    source_text: Rc<String>,
    // The most recent significant token/span (set by try_real_token).
    token: token::Token,
    span: Span,
    /// Stack of open delimiters and their spans. Used for error message.
    open_braces: Vec<(token::DelimToken, Span)>,
}
70
71 impl<'a> StringReader<'a> {
72 fn next_token(&mut self) -> TokenAndSpan where Self: Sized {
73 let res = self.try_next_token();
74 self.unwrap_or_abort(res)
75 }
76 fn unwrap_or_abort(&mut self, res: Result<TokenAndSpan, ()>) -> TokenAndSpan {
77 match res {
78 Ok(tok) => tok,
79 Err(_) => {
80 self.emit_fatal_errors();
81 panic!(FatalError);
82 }
83 }
84 }
85 fn try_real_token(&mut self) -> Result<TokenAndSpan, ()> {
86 let mut t = self.try_next_token()?;
87 loop {
88 match t.tok {
89 token::Whitespace | token::Comment | token::Shebang(_) => {
90 t = self.try_next_token()?;
91 }
92 _ => break,
93 }
94 }
95 self.token = t.tok.clone();
96 self.span = t.sp;
97 Ok(t)
98 }
99 pub fn real_token(&mut self) -> TokenAndSpan {
100 let res = self.try_real_token();
101 self.unwrap_or_abort(res)
102 }
103 fn is_eof(&self) -> bool {
104 if self.ch.is_none() {
105 return true;
106 }
107
108 match self.terminator {
109 Some(t) => self.next_pos > t,
110 None => false,
111 }
112 }
113 /// Return the next token. EFFECT: advances the string_reader.
114 pub fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> {
115 assert!(self.fatal_errs.is_empty());
116 let ret_val = TokenAndSpan {
117 tok: replace(&mut self.peek_tok, token::Underscore),
118 sp: self.peek_span,
119 };
120 self.advance_token()?;
121 Ok(ret_val)
122 }
123 fn fatal(&self, m: &str) -> FatalError {
124 self.fatal_span(self.peek_span, m)
125 }
126 pub fn emit_fatal_errors(&mut self) {
127 for err in &mut self.fatal_errs {
128 err.emit();
129 }
130 self.fatal_errs.clear();
131 }
132 pub fn peek(&self) -> TokenAndSpan {
133 // FIXME(pcwalton): Bad copy!
134 TokenAndSpan {
135 tok: self.peek_tok.clone(),
136 sp: self.peek_span,
137 }
138 }
139 }
140
141 impl<'a> StringReader<'a> {
142 /// For comments.rs, which hackily pokes into next_pos and ch
143 pub fn new_raw<'b>(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
144 let mut sr = StringReader::new_raw_internal(sess, filemap);
145 sr.bump();
146 sr
147 }
148
    /// Builds a reader over `filemap` without priming it: no character has
    /// been read yet, so callers must `bump()` once before lexing.
    fn new_raw_internal(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
        // Lexing is impossible without the source text attached.
        if filemap.src.is_none() {
            sess.span_diagnostic.bug(&format!("Cannot lex filemap without source: {}",
                                              filemap.name));
        }

        let source_text = (*filemap.src.as_ref().unwrap()).clone();

        StringReader {
            sess: sess,
            next_pos: filemap.start_pos,
            pos: filemap.start_pos,
            col: CharPos(0),
            // Seeding with '\n' makes the first bump() treat offset 0 as a
            // line start (bump() records a new line after a newline char).
            ch: Some('\n'),
            filemap: filemap,
            terminator: None,
            save_new_lines_and_multibyte: true,
            // dummy values; not read
            peek_tok: token::Eof,
            peek_span: syntax_pos::DUMMY_SP,
            source_text: source_text,
            fatal_errs: Vec::new(),
            token: token::Eof,
            span: syntax_pos::DUMMY_SP,
            open_braces: Vec::new(),
        }
    }
176
177 pub fn new(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
178 let mut sr = StringReader::new_raw(sess, filemap);
179 if let Err(_) = sr.advance_token() {
180 sr.emit_fatal_errors();
181 panic!(FatalError);
182 }
183 sr
184 }
185
    /// Creates a reader that re-lexes only the source covered by `span`,
    /// without re-recording line/multibyte info in the filemap.
    pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
        let begin = sess.codemap().lookup_byte_offset(span.lo);
        let end = sess.codemap().lookup_byte_offset(span.hi);

        // Make the range zero-length if the span is invalid.
        // (Invalid: reversed, or endpoints in different filemaps.)
        if span.lo > span.hi || begin.fm.start_pos != end.fm.start_pos {
            span.hi = span.lo;
        }

        let mut sr = StringReader::new_raw_internal(sess, begin.fm);

        // Seek the lexer to the right byte range.
        sr.save_new_lines_and_multibyte = false;
        sr.next_pos = span.lo;
        sr.terminator = Some(span.hi);

        // Prime the first character, then the first token.
        sr.bump();

        if let Err(_) = sr.advance_token() {
            sr.emit_fatal_errors();
            panic!(FatalError);
        }
        sr
    }
210
211 pub fn ch_is(&self, c: char) -> bool {
212 self.ch == Some(c)
213 }
214
215 /// Report a fatal lexical error with a given span.
216 pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
217 self.sess.span_diagnostic.span_fatal(sp, m)
218 }
219
220 /// Report a lexical error with a given span.
221 pub fn err_span(&self, sp: Span, m: &str) {
222 self.sess.span_diagnostic.span_err(sp, m)
223 }
224
225
226 /// Report a fatal error spanning [`from_pos`, `to_pos`).
227 fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
228 self.fatal_span(syntax_pos::mk_sp(from_pos, to_pos), m)
229 }
230
231 /// Report a lexical error spanning [`from_pos`, `to_pos`).
232 fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
233 self.err_span(syntax_pos::mk_sp(from_pos, to_pos), m)
234 }
235
236 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
237 /// escaped character to the error message
238 fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
239 let mut m = m.to_string();
240 m.push_str(": ");
241 for c in c.escape_default() {
242 m.push(c)
243 }
244 self.fatal_span_(from_pos, to_pos, &m[..])
245 }
246 fn struct_fatal_span_char(&self,
247 from_pos: BytePos,
248 to_pos: BytePos,
249 m: &str,
250 c: char)
251 -> DiagnosticBuilder<'a> {
252 let mut m = m.to_string();
253 m.push_str(": ");
254 for c in c.escape_default() {
255 m.push(c)
256 }
257 self.sess.span_diagnostic.struct_span_fatal(syntax_pos::mk_sp(from_pos, to_pos), &m[..])
258 }
259
260 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
261 /// escaped character to the error message
262 fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
263 let mut m = m.to_string();
264 m.push_str(": ");
265 for c in c.escape_default() {
266 m.push(c)
267 }
268 self.err_span_(from_pos, to_pos, &m[..]);
269 }
270 fn struct_err_span_char(&self,
271 from_pos: BytePos,
272 to_pos: BytePos,
273 m: &str,
274 c: char)
275 -> DiagnosticBuilder<'a> {
276 let mut m = m.to_string();
277 m.push_str(": ");
278 for c in c.escape_default() {
279 m.push(c)
280 }
281 self.sess.span_diagnostic.struct_span_err(syntax_pos::mk_sp(from_pos, to_pos), &m[..])
282 }
283
284 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
285 /// offending string to the error message
286 fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
287 m.push_str(": ");
288 let from = self.byte_offset(from_pos).to_usize();
289 let to = self.byte_offset(to_pos).to_usize();
290 m.push_str(&self.source_text[from..to]);
291 self.fatal_span_(from_pos, to_pos, &m[..])
292 }
293
294 /// Advance peek_tok and peek_span to refer to the next token, and
295 /// possibly update the interner.
296 fn advance_token(&mut self) -> Result<(), ()> {
297 match self.scan_whitespace_or_comment() {
298 Some(comment) => {
299 self.peek_span = comment.sp;
300 self.peek_tok = comment.tok;
301 }
302 None => {
303 if self.is_eof() {
304 self.peek_tok = token::Eof;
305 self.peek_span = syntax_pos::mk_sp(self.filemap.end_pos, self.filemap.end_pos);
306 } else {
307 let start_bytepos = self.pos;
308 self.peek_tok = self.next_token_inner()?;
309 self.peek_span = syntax_pos::mk_sp(start_bytepos, self.pos);
310 };
311 }
312 }
313 Ok(())
314 }
315
316 fn byte_offset(&self, pos: BytePos) -> BytePos {
317 (pos - self.filemap.start_pos)
318 }
319
320 /// Calls `f` with a string slice of the source text spanning from `start`
321 /// up to but excluding `self.pos`, meaning the slice does not include
322 /// the character `self.ch`.
323 pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T
324 where F: FnOnce(&str) -> T
325 {
326 self.with_str_from_to(start, self.pos, f)
327 }
328
329 /// Create a Name from a given offset to the current offset, each
330 /// adjusted 1 towards each other (assumes that on either side there is a
331 /// single-byte delimiter).
332 pub fn name_from(&self, start: BytePos) -> ast::Name {
333 debug!("taking an ident from {:?} to {:?}", start, self.pos);
334 self.with_str_from(start, Symbol::intern)
335 }
336
337 /// As name_from, with an explicit endpoint.
338 pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name {
339 debug!("taking an ident from {:?} to {:?}", start, end);
340 self.with_str_from_to(start, end, Symbol::intern)
341 }
342
343 /// Calls `f` with a string slice of the source text spanning from `start`
344 /// up to but excluding `end`.
345 fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T
346 where F: FnOnce(&str) -> T
347 {
348 f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()])
349 }
350
    /// Converts CRLF to LF in the given string, raising an error on bare CR.
    fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
        // Fast path: scan for the first CRLF; bare CRs only get an error
        // reported and are left in place on this path.
        let mut i = 0;
        while i < s.len() {
            let ch = char_at(s, i);
            let next = i + ch.len_utf8();
            if ch == '\r' {
                if next < s.len() && char_at(s, next) == '\n' {
                    // First CRLF found: switch to the copying slow path.
                    return translate_crlf_(self, start, s, errmsg, i).into();
                }
                let pos = start + BytePos(i as u32);
                let end_pos = start + BytePos(next as u32);
                self.err_span_(pos, end_pos, errmsg);
            }
            i = next;
        }
        // No CRLF at all: borrow the input unchanged.
        return s.into();

        // Slow path: builds a new String with every CR of a CRLF pair
        // dropped, reporting bare CRs found from position `i` onwards.
        // `j` tracks the start of the next un-copied region of `s`.
        fn translate_crlf_(rdr: &StringReader,
                           start: BytePos,
                           s: &str,
                           errmsg: &str,
                           mut i: usize)
                           -> String {
            let mut buf = String::with_capacity(s.len());
            let mut j = 0;
            while i < s.len() {
                let ch = char_at(s, i);
                let next = i + ch.len_utf8();
                if ch == '\r' {
                    if j < i {
                        buf.push_str(&s[j..i]);
                    }
                    // Skip the CR itself (whether or not an LF follows).
                    j = next;
                    if next >= s.len() || char_at(s, next) != '\n' {
                        let pos = start + BytePos(i as u32);
                        let end_pos = start + BytePos(next as u32);
                        rdr.err_span_(pos, end_pos, errmsg);
                    }
                }
                i = next;
            }
            if j < s.len() {
                buf.push_str(&s[j..]);
            }
            buf
        }
    }
399
400
    /// Advance the StringReader by one character. If a newline is
    /// discovered, add it to the FileMap's list of line start offsets.
    pub fn bump(&mut self) {
        let new_pos = self.next_pos;
        let new_byte_offset = self.byte_offset(new_pos).to_usize();
        // Reading stops either at the configured terminator or at the end
        // of the source text, whichever applies.
        let end = self.terminator.map_or(self.source_text.len(), |t| {
            self.byte_offset(t).to_usize()
        });
        if new_byte_offset < end {
            // NOTE(review): unwrap() relies on `ch` being Some until EOF is
            // reached; `new_raw_internal` seeds it with Some('\n').
            let old_ch_is_newline = self.ch.unwrap() == '\n';
            let new_ch = char_at(&self.source_text, new_byte_offset);
            let new_ch_len = new_ch.len_utf8();

            self.ch = Some(new_ch);
            self.pos = new_pos;
            self.next_pos = new_pos + Pos::from_usize(new_ch_len);
            if old_ch_is_newline {
                // Previous char ended a line: record the new line start
                // (first lexing pass only) and reset the column counter.
                if self.save_new_lines_and_multibyte {
                    self.filemap.next_line(self.pos);
                }
                self.col = CharPos(0);
            } else {
                self.col = self.col + CharPos(1);
            }
            if new_ch_len > 1 {
                // Multibyte chars are recorded so byte<->char positions can
                // be reconciled later (first lexing pass only).
                if self.save_new_lines_and_multibyte {
                    self.filemap.record_multibyte_char(self.pos, new_ch_len);
                }
            }
        } else {
            // Past the end: signal EOF by clearing the current character.
            self.ch = None;
            self.pos = new_pos;
        }
    }
435
436 pub fn nextch(&self) -> Option<char> {
437 let offset = self.byte_offset(self.next_pos).to_usize();
438 if offset < self.source_text.len() {
439 Some(char_at(&self.source_text, offset))
440 } else {
441 None
442 }
443 }
444
445 pub fn nextch_is(&self, c: char) -> bool {
446 self.nextch() == Some(c)
447 }
448
449 pub fn nextnextch(&self) -> Option<char> {
450 let offset = self.byte_offset(self.next_pos).to_usize();
451 let s = &self.source_text[..];
452 if offset >= s.len() {
453 return None;
454 }
455 let next = offset + char_at(s, offset).len_utf8();
456 if next < s.len() {
457 Some(char_at(s, next))
458 } else {
459 None
460 }
461 }
462
463 pub fn nextnextch_is(&self, c: char) -> bool {
464 self.nextnextch() == Some(c)
465 }
466
467 /// Eats <XID_start><XID_continue>*, if possible.
468 fn scan_optional_raw_name(&mut self) -> Option<ast::Name> {
469 if !ident_start(self.ch) {
470 return None;
471 }
472 let start = self.pos;
473 while ident_continue(self.ch) {
474 self.bump();
475 }
476
477 self.with_str_from(start, |string| {
478 if string == "_" {
479 None
480 } else {
481 Some(Symbol::intern(string))
482 }
483 })
484 }
485
    /// PRECONDITION: self.ch is not whitespace
    /// Eats any kind of comment.
    /// Returns the comment token (or shebang) with its span, or None if the
    /// current character does not begin a comment.
    fn scan_comment(&mut self) -> Option<TokenAndSpan> {
        // Internal sanity check: the caller must have skipped whitespace.
        if let Some(c) = self.ch {
            if c.is_whitespace() {
                let msg = "called consume_any_line_comment, but there was whitespace";
                self.sess.span_diagnostic.span_err(syntax_pos::mk_sp(self.pos, self.pos), msg);
            }
        }

        if self.ch_is('/') {
            match self.nextch() {
                Some('/') => {
                    self.bump();
                    self.bump();

                    // line comments starting with "///" or "//!" are doc-comments
                    let doc_comment = self.ch_is('/') || self.ch_is('!');
                    let start_bpos = self.pos - BytePos(2);

                    // Consume until end of line (or EOF); CRLF terminates
                    // the comment, a bare CR inside a doc-comment is an error.
                    while !self.is_eof() {
                        match self.ch.unwrap() {
                            '\n' => break,
                            '\r' => {
                                if self.nextch_is('\n') {
                                    // CRLF
                                    break;
                                } else if doc_comment {
                                    self.err_span_(self.pos,
                                                   self.next_pos,
                                                   "bare CR not allowed in doc-comment");
                                }
                            }
                            _ => (),
                        }
                        self.bump();
                    }

                    return if doc_comment {
                        self.with_str_from(start_bpos, |string| {
                            // comments with only more "/"s are not doc comments
                            let tok = if is_doc_comment(string) {
                                token::DocComment(Symbol::intern(string))
                            } else {
                                token::Comment
                            };

                            Some(TokenAndSpan {
                                tok: tok,
                                sp: syntax_pos::mk_sp(start_bpos, self.pos),
                            })
                        })
                    } else {
                        Some(TokenAndSpan {
                            tok: token::Comment,
                            sp: syntax_pos::mk_sp(start_bpos, self.pos),
                        })
                    };
                }
                Some('*') => {
                    // "/*": hand off to the (possibly nested) block scanner.
                    self.bump();
                    self.bump();
                    self.scan_block_comment()
                }
                _ => None,
            }
        } else if self.ch_is('#') {
            if self.nextch_is('!') {

                // Parse an inner attribute.
                if self.nextnextch_is('[') {
                    return None;
                }

                // I guess this is the only way to figure out if
                // we're at the beginning of the file...
                let cmap = CodeMap::new();
                cmap.files.borrow_mut().push(self.filemap.clone());
                let loc = cmap.lookup_char_pos_adj(self.pos);
                debug!("Skipping a shebang");
                if loc.line == 1 && loc.col == CharPos(0) {
                    // FIXME: Add shebang "token", return it
                    let start = self.pos;
                    while !self.ch_is('\n') && !self.is_eof() {
                        self.bump();
                    }
                    return Some(TokenAndSpan {
                        tok: token::Shebang(self.name_from(start)),
                        sp: syntax_pos::mk_sp(start, self.pos),
                    });
                }
            }
            None
        } else {
            None
        }
    }
583
    /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
    /// return None.
    fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
        match self.ch.unwrap_or('\0') {
            // # to handle shebang at start of file -- this is the entry point
            // for skipping over all "junk"
            '/' | '#' => {
                let c = self.scan_comment();
                debug!("scanning a comment {:?}", c);
                c
            },
            c if is_pattern_whitespace(Some(c)) => {
                // Coalesce a run of whitespace into a single token.
                let start_bpos = self.pos;
                while is_pattern_whitespace(self.ch) {
                    self.bump();
                }
                let c = Some(TokenAndSpan {
                    tok: token::Whitespace,
                    sp: syntax_pos::mk_sp(start_bpos, self.pos),
                });
                debug!("scanning whitespace: {:?}", c);
                c
            }
            _ => None,
        }
    }
610
    /// Might return a sugared-doc-attr
    /// Scans the body of a block comment; the leading "/*" has already been
    /// consumed by the caller.
    fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
        // block comments starting with "/**" or "/*!" are doc-comments
        let is_doc_comment = self.ch_is('*') || self.ch_is('!');
        // Span starts at the already-consumed "/*".
        let start_bpos = self.pos - BytePos(2);

        // Block comments nest; `level` tracks the nesting depth.
        let mut level: isize = 1;
        let mut has_cr = false;
        while level > 0 {
            if self.is_eof() {
                let msg = if is_doc_comment {
                    "unterminated block doc-comment"
                } else {
                    "unterminated block comment"
                };
                let last_bpos = self.pos;
                panic!(self.fatal_span_(start_bpos, last_bpos, msg));
            }
            let n = self.ch.unwrap();
            match n {
                '/' if self.nextch_is('*') => {
                    level += 1;
                    self.bump();
                }
                '*' if self.nextch_is('/') => {
                    level -= 1;
                    self.bump();
                }
                '\r' => {
                    // Remember CRs so doc text can be CRLF-normalized below.
                    has_cr = true;
                }
                _ => (),
            }
            self.bump();
        }

        self.with_str_from(start_bpos, |string| {
            // but comments with only "*"s between two "/"s are not
            let tok = if is_block_doc_comment(string) {
                let string = if has_cr {
                    self.translate_crlf(start_bpos,
                                        string,
                                        "bare CR not allowed in block doc-comment")
                } else {
                    string.into()
                };
                token::DocComment(Symbol::intern(&string[..]))
            } else {
                token::Comment
            };

            Some(TokenAndSpan {
                tok: tok,
                sp: syntax_pos::mk_sp(start_bpos, self.pos),
            })
        })
    }
668
669 /// Scan through any digits (base `scan_radix`) or underscores,
670 /// and return how many digits there were.
671 ///
672 /// `real_radix` represents the true radix of the number we're
673 /// interested in, and errors will be emitted for any digits
674 /// between `real_radix` and `scan_radix`.
675 fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
676 assert!(real_radix <= scan_radix);
677 let mut len = 0;
678 loop {
679 let c = self.ch;
680 if c == Some('_') {
681 debug!("skipping a _");
682 self.bump();
683 continue;
684 }
685 match c.and_then(|cc| cc.to_digit(scan_radix)) {
686 Some(_) => {
687 debug!("{:?} in scan_digits", c);
688 // check that the hypothetical digit is actually
689 // in range for the true radix
690 if c.unwrap().to_digit(real_radix).is_none() {
691 self.err_span_(self.pos,
692 self.next_pos,
693 &format!("invalid digit for a base {} literal", real_radix));
694 }
695 len += 1;
696 self.bump();
697 }
698 _ => return len,
699 }
700 }
701 }
702
    /// Lex a LIT_INTEGER or a LIT_FLOAT
    /// `c` is the first (already peeked) character of the literal.
    fn scan_number(&mut self, c: char) -> token::Lit {
        let num_digits;
        let mut base = 10;
        let start_bpos = self.pos;

        self.bump();

        if c == '0' {
            // A leading 0 may introduce a radix prefix: 0b, 0o, 0x.
            match self.ch.unwrap_or('\0') {
                'b' => {
                    self.bump();
                    base = 2;
                    num_digits = self.scan_digits(2, 10);
                }
                'o' => {
                    self.bump();
                    base = 8;
                    num_digits = self.scan_digits(8, 10);
                }
                'x' => {
                    self.bump();
                    base = 16;
                    num_digits = self.scan_digits(16, 16);
                }
                '0'...'9' | '_' | '.' => {
                    // Plain decimal; the '0' itself counts as one digit.
                    num_digits = self.scan_digits(10, 10) + 1;
                }
                _ => {
                    // just a 0
                    return token::Integer(self.name_from(start_bpos));
                }
            }
        } else if c.is_digit(10) {
            num_digits = self.scan_digits(10, 10) + 1;
        } else {
            num_digits = 0;
        }

        if num_digits == 0 {
            // e.g. `0b` with no digits after the prefix.
            self.err_span_(start_bpos,
                           self.pos,
                           "no valid digits found for number");
            return token::Integer(Symbol::intern("0"));
        }

        // might be a float, but don't be greedy if this is actually an
        // integer literal followed by field/method access or a range pattern
        // (`0..2` and `12.foo()`)
        if self.ch_is('.') && !self.nextch_is('.') &&
           !self.nextch()
                .unwrap_or('\0')
                .is_xid_start() {
            // might have stuff after the ., and if it does, it needs to start
            // with a number
            self.bump();
            if self.ch.unwrap_or('\0').is_digit(10) {
                self.scan_digits(10, 10);
                self.scan_float_exponent();
            }
            let pos = self.pos;
            // Floats are only supported in base 10; anything else errors.
            self.check_float_base(start_bpos, pos, base);
            return token::Float(self.name_from(start_bpos));
        } else {
            // it might be a float if it has an exponent
            if self.ch_is('e') || self.ch_is('E') {
                self.scan_float_exponent();
                let pos = self.pos;
                self.check_float_base(start_bpos, pos, base);
                return token::Float(self.name_from(start_bpos));
            }
            // but we certainly have an integer!
            return token::Integer(self.name_from(start_bpos));
        }
    }
778
    /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an
    /// error if too many or too few digits are encountered.
    /// Returns true if the escape denotes a valid character.
    fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
        debug!("scanning {} digits until {:?}", n_digits, delim);
        let start_bpos = self.pos;
        let mut accum_int = 0;

        // `valid` is cleared (instead of returning early) so all digits are
        // still consumed and later errors can also be reported.
        let mut valid = true;
        for _ in 0..n_digits {
            if self.is_eof() {
                let last_bpos = self.pos;
                panic!(self.fatal_span_(start_bpos,
                                        last_bpos,
                                        "unterminated numeric character escape"));
            }
            if self.ch_is(delim) {
                // Ran into the closing delimiter before reading enough digits.
                let last_bpos = self.pos;
                self.err_span_(start_bpos,
                               last_bpos,
                               "numeric character escape is too short");
                valid = false;
                break;
            }
            let c = self.ch.unwrap_or('\x00');
            accum_int *= 16;
            accum_int += c.to_digit(16).unwrap_or_else(|| {
                self.err_span_char(self.pos,
                                   self.next_pos,
                                   "invalid character in numeric character escape",
                                   c);

                valid = false;
                0
            });
            self.bump();
        }

        // Some escape forms (e.g. \xHH in char/string literals) are
        // restricted to the ASCII range.
        if below_0x7f_only && accum_int >= 0x80 {
            self.err_span_(start_bpos,
                           self.pos,
                           "this form of character escape may only be used with characters in \
                            the range [\\x00-\\x7f]");
            valid = false;
        }

        // The accumulated value must map to a real char (rejects surrogates
        // and out-of-range values).
        match char::from_u32(accum_int) {
            Some(_) => valid,
            None => {
                let last_bpos = self.pos;
                self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
                false
            }
        }
    }
833
    /// Scan for a single (possibly escaped) byte or char
    /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
    /// `start` is the position of `first_source_char`, which is already consumed.
    ///
    /// Returns true if there was a valid char/byte, false otherwise.
    fn scan_char_or_byte(&mut self,
                         start: BytePos,
                         first_source_char: char,
                         ascii_only: bool,
                         delim: char)
                         -> bool {
        match first_source_char {
            '\\' => {
                // '\X' for some X must be a character constant:
                let escaped = self.ch;
                let escaped_pos = self.pos;
                self.bump();
                match escaped {
                    None => {} // EOF here is an error that will be checked later.
                    Some(e) => {
                        return match e {
                            // Simple single-char escapes.
                            'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
                            'x' => self.scan_byte_escape(delim, !ascii_only),
                            'u' => {
                                let valid = if self.ch_is('{') {
                                    // Unicode escapes are never valid in
                                    // byte/byte-string literals.
                                    self.scan_unicode_escape(delim) && !ascii_only
                                } else {
                                    let span = syntax_pos::mk_sp(start, self.pos);
                                    self.sess.span_diagnostic
                                        .struct_span_err(span, "incorrect unicode escape sequence")
                                        .span_help(span,
                                                   "format of unicode escape sequences is \
                                                    `\\u{…}`")
                                        .emit();
                                    false
                                };
                                if ascii_only {
                                    self.err_span_(start,
                                                   self.pos,
                                                   "unicode escape sequences cannot be used as a \
                                                    byte or in a byte string");
                                }
                                valid

                            }
                            // Escaped newline in a string literal: line
                            // continuation, swallow following whitespace.
                            '\n' if delim == '"' => {
                                self.consume_whitespace();
                                true
                            }
                            '\r' if delim == '"' && self.ch_is('\n') => {
                                self.consume_whitespace();
                                true
                            }
                            c => {
                                let pos = self.pos;
                                let mut err = self.struct_err_span_char(escaped_pos,
                                                                        pos,
                                                                        if ascii_only {
                                                                            "unknown byte escape"
                                                                        } else {
                                                                            "unknown character \
                                                                             escape"
                                                                        },
                                                                        c);
                                if e == '\r' {
                                    err.span_help(syntax_pos::mk_sp(escaped_pos, pos),
                                                  "this is an isolated carriage return; consider \
                                                   checking your editor and version control \
                                                   settings");
                                }
                                if (e == '{' || e == '}') && !ascii_only {
                                    err.span_help(syntax_pos::mk_sp(escaped_pos, pos),
                                                  "if used in a formatting string, curly braces \
                                                   are escaped with `{{` and `}}`");
                                }
                                err.emit();
                                false
                            }
                        }
                    }
                }
            }
            // Characters that must always be escaped inside a char literal.
            '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
                let pos = self.pos;
                self.err_span_char(start,
                                   pos,
                                   if ascii_only {
                                       "byte constant must be escaped"
                                   } else {
                                       "character constant must be escaped"
                                   },
                                   first_source_char);
                return false;
            }
            '\r' => {
                // A raw CR is only allowed as part of a CRLF pair.
                if self.ch_is('\n') {
                    self.bump();
                    return true;
                } else {
                    self.err_span_(start,
                                   self.pos,
                                   "bare CR not allowed in string, use \\r instead");
                    return false;
                }
            }
            _ => {
                if ascii_only && first_source_char > '\x7F' {
                    let pos = self.pos;
                    self.err_span_(start,
                                   pos,
                                   "byte constant must be ASCII. Use a \\xHH escape for a \
                                    non-ASCII byte");
                    return false;
                }
            }
        }
        true
    }
952
    /// Scan over a \u{...} escape
    ///
    /// At this point, we have already seen the \ and the u, the { is the current character. We
    /// will read at least one digit, and up to 6, and pass over the }.
    fn scan_unicode_escape(&mut self, delim: char) -> bool {
        self.bump(); // past the {
        let start_bpos = self.pos;
        let mut count = 0;
        let mut accum_int = 0;
        let mut valid = true;

        // `count <= 6` lets a 7th digit through so the "overlong" case can
        // be detected and reported below.
        while !self.ch_is('}') && count <= 6 {
            let c = match self.ch {
                Some(c) => c,
                None => {
                    panic!(self.fatal_span_(start_bpos,
                                            self.pos,
                                            "unterminated unicode escape (found EOF)"));
                }
            };
            accum_int *= 16;
            accum_int += c.to_digit(16).unwrap_or_else(|| {
                if c == delim {
                    // Hitting the literal's delimiter means the `}` never came.
                    panic!(self.fatal_span_(self.pos,
                                            self.next_pos,
                                            "unterminated unicode escape (needed a `}`)"));
                } else {
                    self.err_span_char(self.pos,
                                       self.next_pos,
                                       "invalid character in unicode escape",
                                       c);
                }
                valid = false;
                0
            });
            self.bump();
            count += 1;
        }

        if count > 6 {
            self.err_span_(start_bpos,
                           self.pos,
                           "overlong unicode escape (can have at most 6 hex digits)");
            valid = false;
        }

        // The value must name a real char, and at least one digit must have
        // been present (`\u{}` is invalid).
        if valid && (char::from_u32(accum_int).is_none() || count == 0) {
            self.err_span_(start_bpos,
                           self.pos,
                           "invalid unicode character escape");
            valid = false;
        }

        self.bump(); // past the ending }
        valid
    }
1009
1010 /// Scan over a float exponent.
1011 fn scan_float_exponent(&mut self) {
1012 if self.ch_is('e') || self.ch_is('E') {
1013 self.bump();
1014 if self.ch_is('-') || self.ch_is('+') {
1015 self.bump();
1016 }
1017 if self.scan_digits(10, 10) == 0 {
1018 self.err_span_(self.pos,
1019 self.next_pos,
1020 "expected at least one digit in exponent")
1021 }
1022 }
1023 }
1024
1025 /// Check that a base is valid for a floating literal, emitting a nice
1026 /// error if it isn't.
1027 fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) {
1028 match base {
1029 16 => {
1030 self.err_span_(start_bpos,
1031 last_bpos,
1032 "hexadecimal float literal is not supported")
1033 }
1034 8 => {
1035 self.err_span_(start_bpos,
1036 last_bpos,
1037 "octal float literal is not supported")
1038 }
1039 2 => {
1040 self.err_span_(start_bpos,
1041 last_bpos,
1042 "binary float literal is not supported")
1043 }
1044 _ => (),
1045 }
1046 }
1047
1048 fn binop(&mut self, op: token::BinOpToken) -> token::Token {
1049 self.bump();
1050 if self.ch_is('=') {
1051 self.bump();
1052 return token::BinOpEq(op);
1053 } else {
1054 return token::BinOp(op);
1055 }
1056 }
1057
1058 /// Return the next token from the string, advances the input past that
1059 /// token, and updates the interner
1060 fn next_token_inner(&mut self) -> Result<token::Token, ()> {
1061 let c = self.ch;
1062 if ident_start(c) &&
1063 match (c.unwrap(), self.nextch(), self.nextnextch()) {
1064 // Note: r as in r" or r#" is part of a raw string literal,
1065 // b as in b' is part of a byte literal.
1066 // They are not identifiers, and are handled further down.
1067 ('r', Some('"'), _) |
1068 ('r', Some('#'), _) |
1069 ('b', Some('"'), _) |
1070 ('b', Some('\''), _) |
1071 ('b', Some('r'), Some('"')) |
1072 ('b', Some('r'), Some('#')) => false,
1073 _ => true,
1074 } {
1075 let start = self.pos;
1076 while ident_continue(self.ch) {
1077 self.bump();
1078 }
1079
1080 return Ok(self.with_str_from(start, |string| {
1081 if string == "_" {
1082 token::Underscore
1083 } else {
1084 // FIXME: perform NFKC normalization here. (Issue #2253)
1085 token::Ident(Ident::from_str(string))
1086 }
1087 }));
1088 }
1089
1090 if is_dec_digit(c) {
1091 let num = self.scan_number(c.unwrap());
1092 let suffix = self.scan_optional_raw_name();
1093 debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix);
1094 return Ok(token::Literal(num, suffix));
1095 }
1096
1097 match c.expect("next_token_inner called at EOF") {
1098 // One-byte tokens.
1099 ';' => {
1100 self.bump();
1101 return Ok(token::Semi);
1102 }
1103 ',' => {
1104 self.bump();
1105 return Ok(token::Comma);
1106 }
1107 '.' => {
1108 self.bump();
1109 return if self.ch_is('.') {
1110 self.bump();
1111 if self.ch_is('.') {
1112 self.bump();
1113 Ok(token::DotDotDot)
1114 } else {
1115 Ok(token::DotDot)
1116 }
1117 } else {
1118 Ok(token::Dot)
1119 };
1120 }
1121 '(' => {
1122 self.bump();
1123 return Ok(token::OpenDelim(token::Paren));
1124 }
1125 ')' => {
1126 self.bump();
1127 return Ok(token::CloseDelim(token::Paren));
1128 }
1129 '{' => {
1130 self.bump();
1131 return Ok(token::OpenDelim(token::Brace));
1132 }
1133 '}' => {
1134 self.bump();
1135 return Ok(token::CloseDelim(token::Brace));
1136 }
1137 '[' => {
1138 self.bump();
1139 return Ok(token::OpenDelim(token::Bracket));
1140 }
1141 ']' => {
1142 self.bump();
1143 return Ok(token::CloseDelim(token::Bracket));
1144 }
1145 '@' => {
1146 self.bump();
1147 return Ok(token::At);
1148 }
1149 '#' => {
1150 self.bump();
1151 return Ok(token::Pound);
1152 }
1153 '~' => {
1154 self.bump();
1155 return Ok(token::Tilde);
1156 }
1157 '?' => {
1158 self.bump();
1159 return Ok(token::Question);
1160 }
1161 ':' => {
1162 self.bump();
1163 if self.ch_is(':') {
1164 self.bump();
1165 return Ok(token::ModSep);
1166 } else {
1167 return Ok(token::Colon);
1168 }
1169 }
1170
1171 '$' => {
1172 self.bump();
1173 return Ok(token::Dollar);
1174 }
1175
1176 // Multi-byte tokens.
1177 '=' => {
1178 self.bump();
1179 if self.ch_is('=') {
1180 self.bump();
1181 return Ok(token::EqEq);
1182 } else if self.ch_is('>') {
1183 self.bump();
1184 return Ok(token::FatArrow);
1185 } else {
1186 return Ok(token::Eq);
1187 }
1188 }
1189 '!' => {
1190 self.bump();
1191 if self.ch_is('=') {
1192 self.bump();
1193 return Ok(token::Ne);
1194 } else {
1195 return Ok(token::Not);
1196 }
1197 }
1198 '<' => {
1199 self.bump();
1200 match self.ch.unwrap_or('\x00') {
1201 '=' => {
1202 self.bump();
1203 return Ok(token::Le);
1204 }
1205 '<' => {
1206 return Ok(self.binop(token::Shl));
1207 }
1208 '-' => {
1209 self.bump();
1210 match self.ch.unwrap_or('\x00') {
1211 _ => {
1212 return Ok(token::LArrow);
1213 }
1214 }
1215 }
1216 _ => {
1217 return Ok(token::Lt);
1218 }
1219 }
1220 }
1221 '>' => {
1222 self.bump();
1223 match self.ch.unwrap_or('\x00') {
1224 '=' => {
1225 self.bump();
1226 return Ok(token::Ge);
1227 }
1228 '>' => {
1229 return Ok(self.binop(token::Shr));
1230 }
1231 _ => {
1232 return Ok(token::Gt);
1233 }
1234 }
1235 }
1236 '\'' => {
1237 // Either a character constant 'a' OR a lifetime name 'abc
1238 let start_with_quote = self.pos;
1239 self.bump();
1240 let start = self.pos;
1241
1242 // the eof will be picked up by the final `'` check below
1243 let c2 = self.ch.unwrap_or('\x00');
1244 self.bump();
1245
1246 // If the character is an ident start not followed by another single
1247 // quote, then this is a lifetime name:
1248 if ident_start(Some(c2)) && !self.ch_is('\'') {
1249 while ident_continue(self.ch) {
1250 self.bump();
1251 }
1252 // lifetimes shouldn't end with a single quote
1253 // if we find one, then this is an invalid character literal
1254 if self.ch_is('\'') {
1255 panic!(self.fatal_span_verbose(
1256 start_with_quote, self.next_pos,
1257 String::from("character literal may only contain one codepoint")));
1258
1259 }
1260
1261 // Include the leading `'` in the real identifier, for macro
1262 // expansion purposes. See #12512 for the gory details of why
1263 // this is necessary.
1264 let ident = self.with_str_from(start, |lifetime_name| {
1265 Ident::from_str(&format!("'{}", lifetime_name))
1266 });
1267
1268 // Conjure up a "keyword checking ident" to make sure that
1269 // the lifetime name is not a keyword.
1270 let keyword_checking_ident = self.with_str_from(start, |lifetime_name| {
1271 Ident::from_str(lifetime_name)
1272 });
1273 let keyword_checking_token = &token::Ident(keyword_checking_ident);
1274 let last_bpos = self.pos;
1275 if keyword_checking_token.is_any_keyword() &&
1276 !keyword_checking_token.is_keyword(keywords::Static) {
1277 self.err_span_(start, last_bpos, "lifetimes cannot use keyword names");
1278 }
1279
1280 return Ok(token::Lifetime(ident));
1281 }
1282
1283 let valid = self.scan_char_or_byte(start,
1284 c2,
1285 // ascii_only =
1286 false,
1287 '\'');
1288
1289 if !self.ch_is('\'') {
1290 panic!(self.fatal_span_verbose(
1291 start_with_quote, self.pos,
1292 String::from("character literal may only contain one codepoint")));
1293 }
1294
1295 let id = if valid {
1296 self.name_from(start)
1297 } else {
1298 Symbol::intern("0")
1299 };
1300 self.bump(); // advance ch past token
1301 let suffix = self.scan_optional_raw_name();
1302 return Ok(token::Literal(token::Char(id), suffix));
1303 }
1304 'b' => {
1305 self.bump();
1306 let lit = match self.ch {
1307 Some('\'') => self.scan_byte(),
1308 Some('"') => self.scan_byte_string(),
1309 Some('r') => self.scan_raw_byte_string(),
1310 _ => unreachable!(), // Should have been a token::Ident above.
1311 };
1312 let suffix = self.scan_optional_raw_name();
1313 return Ok(token::Literal(lit, suffix));
1314 }
1315 '"' => {
1316 let start_bpos = self.pos;
1317 let mut valid = true;
1318 self.bump();
1319 while !self.ch_is('"') {
1320 if self.is_eof() {
1321 let last_bpos = self.pos;
1322 panic!(self.fatal_span_(start_bpos,
1323 last_bpos,
1324 "unterminated double quote string"));
1325 }
1326
1327 let ch_start = self.pos;
1328 let ch = self.ch.unwrap();
1329 self.bump();
1330 valid &= self.scan_char_or_byte(ch_start,
1331 ch,
1332 // ascii_only =
1333 false,
1334 '"');
1335 }
1336 // adjust for the ASCII " at the start of the literal
1337 let id = if valid {
1338 self.name_from(start_bpos + BytePos(1))
1339 } else {
1340 Symbol::intern("??")
1341 };
1342 self.bump();
1343 let suffix = self.scan_optional_raw_name();
1344 return Ok(token::Literal(token::Str_(id), suffix));
1345 }
1346 'r' => {
1347 let start_bpos = self.pos;
1348 self.bump();
1349 let mut hash_count = 0;
1350 while self.ch_is('#') {
1351 self.bump();
1352 hash_count += 1;
1353 }
1354
1355 if self.is_eof() {
1356 let last_bpos = self.pos;
1357 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1358 } else if !self.ch_is('"') {
1359 let last_bpos = self.pos;
1360 let curr_char = self.ch.unwrap();
1361 panic!(self.fatal_span_char(start_bpos,
1362 last_bpos,
1363 "found invalid character; only `#` is allowed \
1364 in raw string delimitation",
1365 curr_char));
1366 }
1367 self.bump();
1368 let content_start_bpos = self.pos;
1369 let mut content_end_bpos;
1370 let mut valid = true;
1371 'outer: loop {
1372 if self.is_eof() {
1373 let last_bpos = self.pos;
1374 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1375 }
1376 // if self.ch_is('"') {
1377 // content_end_bpos = self.pos;
1378 // for _ in 0..hash_count {
1379 // self.bump();
1380 // if !self.ch_is('#') {
1381 // continue 'outer;
1382 let c = self.ch.unwrap();
1383 match c {
1384 '"' => {
1385 content_end_bpos = self.pos;
1386 for _ in 0..hash_count {
1387 self.bump();
1388 if !self.ch_is('#') {
1389 continue 'outer;
1390 }
1391 }
1392 break;
1393 }
1394 '\r' => {
1395 if !self.nextch_is('\n') {
1396 let last_bpos = self.pos;
1397 self.err_span_(start_bpos,
1398 last_bpos,
1399 "bare CR not allowed in raw string, use \\r \
1400 instead");
1401 valid = false;
1402 }
1403 }
1404 _ => (),
1405 }
1406 self.bump();
1407 }
1408 self.bump();
1409 let id = if valid {
1410 self.name_from_to(content_start_bpos, content_end_bpos)
1411 } else {
1412 Symbol::intern("??")
1413 };
1414 let suffix = self.scan_optional_raw_name();
1415 return Ok(token::Literal(token::StrRaw(id, hash_count), suffix));
1416 }
1417 '-' => {
1418 if self.nextch_is('>') {
1419 self.bump();
1420 self.bump();
1421 return Ok(token::RArrow);
1422 } else {
1423 return Ok(self.binop(token::Minus));
1424 }
1425 }
1426 '&' => {
1427 if self.nextch_is('&') {
1428 self.bump();
1429 self.bump();
1430 return Ok(token::AndAnd);
1431 } else {
1432 return Ok(self.binop(token::And));
1433 }
1434 }
1435 '|' => {
1436 match self.nextch() {
1437 Some('|') => {
1438 self.bump();
1439 self.bump();
1440 return Ok(token::OrOr);
1441 }
1442 _ => {
1443 return Ok(self.binop(token::Or));
1444 }
1445 }
1446 }
1447 '+' => {
1448 return Ok(self.binop(token::Plus));
1449 }
1450 '*' => {
1451 return Ok(self.binop(token::Star));
1452 }
1453 '/' => {
1454 return Ok(self.binop(token::Slash));
1455 }
1456 '^' => {
1457 return Ok(self.binop(token::Caret));
1458 }
1459 '%' => {
1460 return Ok(self.binop(token::Percent));
1461 }
1462 c => {
1463 let last_bpos = self.pos;
1464 let bpos = self.next_pos;
1465 let mut err = self.struct_fatal_span_char(last_bpos,
1466 bpos,
1467 "unknown start of token",
1468 c);
1469 unicode_chars::check_for_substitution(&self, c, &mut err);
1470 self.fatal_errs.push(err);
1471 Err(())
1472 }
1473 }
1474 }
1475
1476 fn consume_whitespace(&mut self) {
1477 while is_pattern_whitespace(self.ch) && !self.is_eof() {
1478 self.bump();
1479 }
1480 }
1481
1482 fn read_to_eol(&mut self) -> String {
1483 let mut val = String::new();
1484 while !self.ch_is('\n') && !self.is_eof() {
1485 val.push(self.ch.unwrap());
1486 self.bump();
1487 }
1488 if self.ch_is('\n') {
1489 self.bump();
1490 }
1491 return val;
1492 }
1493
1494 fn read_one_line_comment(&mut self) -> String {
1495 let val = self.read_to_eol();
1496 assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') ||
1497 (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!'));
1498 return val;
1499 }
1500
1501 fn consume_non_eol_whitespace(&mut self) {
1502 while is_pattern_whitespace(self.ch) && !self.ch_is('\n') && !self.is_eof() {
1503 self.bump();
1504 }
1505 }
1506
1507 fn peeking_at_comment(&self) -> bool {
1508 (self.ch_is('/') && self.nextch_is('/')) || (self.ch_is('/') && self.nextch_is('*')) ||
1509 // consider shebangs comments, but not inner attributes
1510 (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
1511 }
1512
    /// Scan a byte literal `b'x'`. Assumes the leading `b` has already
    /// been consumed and `self.ch` is the opening single quote.
    fn scan_byte(&mut self) -> token::Lit {
        self.bump();
        let start = self.pos;

        // the eof will be picked up by the final `'` check below
        let c2 = self.ch.unwrap_or('\x00');
        self.bump();

        let valid = self.scan_char_or_byte(start,
                                           c2,
                                           // ascii_only =
                                           true,
                                           '\'');
        if !self.ch_is('\'') {
            // Byte offsetting here is okay because the two
            // characters before position `start` are an ASCII
            // single quote and an ASCII 'b'.
            let pos = self.pos;
            panic!(self.fatal_span_verbose(start - BytePos(2),
                                           pos,
                                           "unterminated byte constant".to_string()));
        }

        // An invalid literal still yields a Byte token, with the
        // placeholder name "?", so later stages can keep going.
        let id = if valid {
            self.name_from(start)
        } else {
            Symbol::intern("?")
        };
        self.bump(); // advance ch past token
        return token::Byte(id);
    }
1544
    /// Scan a two-digit hex escape (`\xNN`) inside a byte or byte-string
    /// literal delimited by `delim`. When `below_0x7f_only` is set, the
    /// escaped value must stay in the ASCII range. Returns whether the
    /// escape was valid.
    fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
        self.scan_hex_digits(2, delim, below_0x7f_only)
    }
1548
1549 fn scan_byte_string(&mut self) -> token::Lit {
1550 self.bump();
1551 let start = self.pos;
1552 let mut valid = true;
1553
1554 while !self.ch_is('"') {
1555 if self.is_eof() {
1556 let pos = self.pos;
1557 panic!(self.fatal_span_(start, pos, "unterminated double quote byte string"));
1558 }
1559
1560 let ch_start = self.pos;
1561 let ch = self.ch.unwrap();
1562 self.bump();
1563 valid &= self.scan_char_or_byte(ch_start,
1564 ch,
1565 // ascii_only =
1566 true,
1567 '"');
1568 }
1569 let id = if valid {
1570 self.name_from(start)
1571 } else {
1572 Symbol::intern("??")
1573 };
1574 self.bump();
1575 return token::ByteStr(id);
1576 }
1577
    /// Scan a raw byte string literal `br##"..."##`. Assumes `b` and `r`
    /// have already been consumed and `self.ch` is the first `#` (or the
    /// opening double quote when there are no hashes).
    fn scan_raw_byte_string(&mut self) -> token::Lit {
        let start_bpos = self.pos;
        self.bump();
        // Count the `#`s of the opening delimiter; the closing `"` must
        // be followed by exactly this many `#`s.
        let mut hash_count = 0;
        while self.ch_is('#') {
            self.bump();
            hash_count += 1;
        }

        if self.is_eof() {
            let pos = self.pos;
            panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"));
        } else if !self.ch_is('"') {
            let pos = self.pos;
            let ch = self.ch.unwrap();
            panic!(self.fatal_span_char(start_bpos,
                                        pos,
                                        "found invalid character; only `#` is allowed in raw \
                                         string delimitation",
                                        ch));
        }
        self.bump();
        let content_start_bpos = self.pos;
        let mut content_end_bpos;
        'outer: loop {
            match self.ch {
                None => {
                    let pos = self.pos;
                    panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"))
                }
                Some('"') => {
                    // Candidate closing quote: require `hash_count`
                    // trailing `#`s, otherwise it is just content and we
                    // resume scanning from wherever the mismatch left us.
                    content_end_bpos = self.pos;
                    for _ in 0..hash_count {
                        self.bump();
                        if !self.ch_is('#') {
                            continue 'outer;
                        }
                    }
                    break;
                }
                Some(c) => {
                    // Raw *byte* strings must be pure ASCII; report but
                    // keep scanning so further errors are still found.
                    if c > '\x7F' {
                        let pos = self.pos;
                        self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
                    }
                }
            }
            self.bump();
        }
        self.bump();
        return token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos),
                                 hash_count);
    }
1631 }
1632
1633 // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1634 // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1635 pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1636 c.map_or(false, Pattern_White_Space)
1637 }
1638
/// Is `c` a character within the inclusive range `lo` to `hi`?
/// `None` (EOF) is never in range.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}
1645
/// Is `c` an ASCII decimal digit (`0` through `9`)?
/// `None` (EOF) is not a digit.
fn is_dec_digit(c: Option<char>) -> bool {
    match c {
        Some(d) => d >= '0' && d <= '9',
        None => false,
    }
}
1649
1650 pub fn is_doc_comment(s: &str) -> bool {
1651 let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
1652 s.starts_with("//!");
1653 debug!("is {:?} a doc comment? {}", s, res);
1654 res
1655 }
1656
1657 pub fn is_block_doc_comment(s: &str) -> bool {
1658 // Prevent `/**/` from being parsed as a doc comment
1659 let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
1660 s.starts_with("/*!")) && s.len() >= 5;
1661 debug!("is {:?} a doc comment? {}", s, res);
1662 res
1663 }
1664
1665 fn ident_start(c: Option<char>) -> bool {
1666 let c = match c {
1667 Some(c) => c,
1668 None => return false,
1669 };
1670
1671 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start())
1672 }
1673
1674 fn ident_continue(c: Option<char>) -> bool {
1675 let c = match c {
1676 Some(c) => c,
1677 None => return false,
1678 };
1679
1680 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' ||
1681 (c > '\x7f' && c.is_xid_continue())
1682 }
1683
#[cfg(test)]
mod tests {
    use super::*;

    use ast::{Ident, CrateConfig};
    use symbol::Symbol;
    use syntax_pos::{BytePos, Span, NO_EXPANSION};
    use codemap::CodeMap;
    use errors;
    use feature_gate::UnstableFeatures;
    use parse::token;
    use std::cell::RefCell;
    use std::collections::HashSet;
    use std::io;
    use std::rc::Rc;

    // Build a ParseSess whose diagnostics are written to `io::sink`, so
    // tests can lex erroneous input without producing output noise.
    fn mk_sess(cm: Rc<CodeMap>) -> ParseSess {
        let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), Some(cm.clone()));
        ParseSess {
            span_diagnostic: errors::Handler::with_emitter(true, false, Box::new(emitter)),
            unstable_features: UnstableFeatures::from_environment(),
            config: CrateConfig::new(),
            included_mod_stack: RefCell::new(Vec::new()),
            code_map: cm,
            missing_fragment_specifiers: RefCell::new(HashSet::new()),
        }
    }

    // open a string reader for the given string
    fn setup<'a>(cm: &CodeMap,
                 sess: &'a ParseSess,
                 teststr: String)
                 -> StringReader<'a> {
        let fm = cm.new_filemap("zebra.rs".to_string(), None, teststr);
        StringReader::new(sess, fm)
    }

    // Lex a small source file and check both the token kinds and the
    // byte positions the reader reports as it advances.
    #[test]
    fn t1() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        let mut string_reader = setup(&cm,
                                      &sh,
                                      "/* my source file */ fn main() { println!(\"zebra\"); }\n"
                                          .to_string());
        let id = Ident::from_str("fn");
        assert_eq!(string_reader.next_token().tok, token::Comment);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);
        let tok1 = string_reader.next_token();
        let tok2 = TokenAndSpan {
            tok: token::Ident(id),
            sp: Span {
                lo: BytePos(21),
                hi: BytePos(23),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(tok1, tok2);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);
        // the 'main' id is already read:
        assert_eq!(string_reader.pos.clone(), BytePos(28));
        // read another token:
        let tok3 = string_reader.next_token();
        let tok4 = TokenAndSpan {
            tok: token::Ident(Ident::from_str("main")),
            sp: Span {
                lo: BytePos(24),
                hi: BytePos(28),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(tok3, tok4);
        // the lparen is already read:
        assert_eq!(string_reader.pos.clone(), BytePos(29))
    }

    // check that the given reader produces the desired stream
    // of tokens (stop checking after exhausting the expected vec)
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for expected_tok in &expected {
            assert_eq!(&string_reader.next_token().tok, expected_tok);
        }
    }

    // make the identifier by looking up the string in the interner
    fn mk_ident(id: &str) -> token::Token {
        token::Ident(Ident::from_str(id))
    }

    // Idents separated by whitespace lex as ident / whitespace / ident.
    #[test]
    fn doublecolonparsing() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a b".to_string()),
                           vec![mk_ident("a"), token::Whitespace, mk_ident("b")]);
    }

    // `::` lexes as a single ModSep token.
    #[test]
    fn dcparsing_2() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a::b".to_string()),
                           vec![mk_ident("a"), token::ModSep, mk_ident("b")]);
    }

    // Whitespace before `::` is tokenized separately.
    #[test]
    fn dcparsing_3() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a ::b".to_string()),
                           vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")]);
    }

    // Whitespace after `::` is tokenized separately.
    #[test]
    fn dcparsing_4() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a:: b".to_string()),
                           vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")]);
    }

    // A simple character literal.
    #[test]
    fn character_a() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "'a'".to_string()).next_token().tok,
                   token::Literal(token::Char(Symbol::intern("a")), None));
    }

    // A space inside a char literal is a char, not whitespace.
    #[test]
    fn character_space() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "' '".to_string()).next_token().tok,
                   token::Literal(token::Char(Symbol::intern(" ")), None));
    }

    // Escapes are kept unprocessed in the token's interned name.
    #[test]
    fn character_escaped() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "'\\n'".to_string()).next_token().tok,
                   token::Literal(token::Char(Symbol::intern("\\n")), None));
    }

    // The leading `'` is included in the lifetime's identifier.
    #[test]
    fn lifetime_name() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "'abc".to_string()).next_token().tok,
                   token::Lifetime(Ident::from_str("'abc")));
    }

    // Raw strings keep their content verbatim and record the hash count.
    #[test]
    fn raw_string() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "r###\"\"#a\\b\x00c\"\"###".to_string())
                       .next_token()
                       .tok,
                   token::Literal(token::StrRaw(Symbol::intern("\"#a\\b\x00c\""), 3), None));
    }

    // A suffix directly after a literal attaches to it; a separated one
    // does not.
    #[test]
    fn literal_suffixes() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        macro_rules! test {
            ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
                assert_eq!(setup(&cm, &sh, format!("{}suffix", $input)).next_token().tok,
                           token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
                                          Some(Symbol::intern("suffix"))));
                // with a whitespace separator:
                assert_eq!(setup(&cm, &sh, format!("{} suffix", $input)).next_token().tok,
                           token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
                                          None));
            }}
        }

        test!("'a'", Char, "a");
        test!("b'a'", Byte, "a");
        test!("\"a\"", Str_, "a");
        test!("b\"a\"", ByteStr, "a");
        test!("1234", Integer, "1234");
        test!("0b101", Integer, "0b101");
        test!("0xABC", Integer, "0xABC");
        test!("1.0", Float, "1.0");
        test!("1.0e10", Float, "1.0e10");

        assert_eq!(setup(&cm, &sh, "2us".to_string()).next_token().tok,
                   token::Literal(token::Integer(Symbol::intern("2")),
                                  Some(Symbol::intern("us"))));
        assert_eq!(setup(&cm, &sh, "r###\"raw\"###suffix".to_string()).next_token().tok,
                   token::Literal(token::StrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));
        assert_eq!(setup(&cm, &sh, "br###\"raw\"###suffix".to_string()).next_token().tok,
                   token::Literal(token::ByteStrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));
    }

    // `////` must not count as a doc comment.
    #[test]
    fn line_doc_comments() {
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        assert!(!is_doc_comment("////"));
    }

    // Nested block comments lex as a single Comment token.
    #[test]
    fn nested_block_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
        match lexer.next_token().tok {
            token::Comment => {}
            _ => panic!("expected a comment!"),
        }
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(Symbol::intern("a")), None));
    }

    // CRLF line endings: the comment span excludes the `\r\n`, and the
    // doc comment's text is normalized without the `\r`.
    #[test]
    fn crlf_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());
        let comment = lexer.next_token();
        assert_eq!(comment.tok, token::Comment);
        assert_eq!(comment.sp, ::syntax_pos::mk_sp(BytePos(0), BytePos(7)));
        assert_eq!(lexer.next_token().tok, token::Whitespace);
        assert_eq!(lexer.next_token().tok,
                   token::DocComment(Symbol::intern("/// test")));
    }
}