]>
Commit | Line | Data |
---|---|---|
1a4d82fc JJ |
1 | // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
476ff2be | 11 | use ast::{self, Ident}; |
3157f602 XL |
12 | use syntax_pos::{self, BytePos, CharPos, Pos, Span}; |
13 | use codemap::CodeMap; | |
9cc50fc6 | 14 | use errors::{FatalError, Handler, DiagnosticBuilder}; |
1a4d82fc | 15 | use ext::tt::transcribe::tt_next_token; |
476ff2be | 16 | use parse::token; |
d9579d0f | 17 | use str::char_at; |
476ff2be SL |
18 | use symbol::{Symbol, keywords}; |
19 | use std_unicode::property::Pattern_White_Space; | |
1a4d82fc | 20 | |
d9579d0f | 21 | use std::borrow::Cow; |
1a4d82fc | 22 | use std::char; |
1a4d82fc | 23 | use std::mem::replace; |
1a4d82fc | 24 | use std::rc::Rc; |
1a4d82fc | 25 | |
c30ab7b3 | 26 | pub use ext::tt::transcribe::{TtReader, new_tt_reader}; |
1a4d82fc JJ |
27 | |
28 | pub mod comments; | |
92a42be0 | 29 | mod unicode_chars; |
1a4d82fc JJ |
30 | |
31 | pub trait Reader { | |
32 | fn is_eof(&self) -> bool; | |
a7813a04 XL |
33 | fn try_next_token(&mut self) -> Result<TokenAndSpan, ()>; |
34 | fn next_token(&mut self) -> TokenAndSpan where Self: Sized { | |
35 | let res = self.try_next_token(); | |
36 | self.unwrap_or_abort(res) | |
37 | } | |
1a4d82fc | 38 | /// Report a fatal error with the current span. |
92a42be0 | 39 | fn fatal(&self, &str) -> FatalError; |
1a4d82fc JJ |
40 | /// Report a non-fatal error with the current span. |
41 | fn err(&self, &str); | |
a7813a04 XL |
42 | fn emit_fatal_errors(&mut self); |
43 | fn unwrap_or_abort(&mut self, res: Result<TokenAndSpan, ()>) -> TokenAndSpan { | |
44 | match res { | |
45 | Ok(tok) => tok, | |
46 | Err(_) => { | |
47 | self.emit_fatal_errors(); | |
48 | panic!(FatalError); | |
49 | } | |
50 | } | |
51 | } | |
1a4d82fc JJ |
52 | fn peek(&self) -> TokenAndSpan; |
53 | /// Get a token the parser cares about. | |
a7813a04 XL |
54 | fn try_real_token(&mut self) -> Result<TokenAndSpan, ()> { |
55 | let mut t = self.try_next_token()?; | |
1a4d82fc JJ |
56 | loop { |
57 | match t.tok { | |
58 | token::Whitespace | token::Comment | token::Shebang(_) => { | |
a7813a04 | 59 | t = self.try_next_token()?; |
9cc50fc6 SL |
60 | } |
61 | _ => break, | |
1a4d82fc JJ |
62 | } |
63 | } | |
a7813a04 XL |
64 | Ok(t) |
65 | } | |
66 | fn real_token(&mut self) -> TokenAndSpan { | |
67 | let res = self.try_real_token(); | |
68 | self.unwrap_or_abort(res) | |
1a4d82fc JJ |
69 | } |
70 | } | |
71 | ||
85aaf69f | 72 | #[derive(Clone, PartialEq, Eq, Debug)] |
1a4d82fc JJ |
73 | pub struct TokenAndSpan { |
74 | pub tok: token::Token, | |
75 | pub sp: Span, | |
76 | } | |
77 | ||
c30ab7b3 SL |
78 | impl Default for TokenAndSpan { |
79 | fn default() -> Self { | |
80 | TokenAndSpan { tok: token::Underscore, sp: syntax_pos::DUMMY_SP } | |
81 | } | |
82 | } | |
83 | ||
1a4d82fc | 84 | pub struct StringReader<'a> { |
9cc50fc6 | 85 | pub span_diagnostic: &'a Handler, |
1a4d82fc | 86 | /// The absolute offset within the codemap of the next character to read |
c30ab7b3 SL |
87 | pub next_pos: BytePos, |
88 | /// The absolute offset within the codemap of the current character | |
1a4d82fc | 89 | pub pos: BytePos, |
1a4d82fc JJ |
90 | /// The column of the next character to read |
91 | pub col: CharPos, | |
c30ab7b3 SL |
92 | /// The current character (which has been read from self.pos) |
93 | pub ch: Option<char>, | |
3157f602 | 94 | pub filemap: Rc<syntax_pos::FileMap>, |
9e0c209e SL |
95 | /// If Some, stop reading the source at this position (inclusive). |
96 | pub terminator: Option<BytePos>, | |
97 | /// Whether to record new-lines in filemap. This is only necessary the first | |
98 | /// time a filemap is lexed. If part of a filemap is being re-lexed, this | |
99 | /// should be set to false. | |
100 | pub save_new_lines: bool, | |
9cc50fc6 | 101 | // cached: |
1a4d82fc JJ |
102 | pub peek_tok: token::Token, |
103 | pub peek_span: Span, | |
a7813a04 | 104 | pub fatal_errs: Vec<DiagnosticBuilder<'a>>, |
c34b1796 AL |
105 | // cache a direct reference to the source text, so that we don't have to |
106 | // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time. | |
9cc50fc6 | 107 | source_text: Rc<String>, |
1a4d82fc JJ |
108 | } |
109 | ||
110 | impl<'a> Reader for StringReader<'a> { | |
9cc50fc6 | 111 | fn is_eof(&self) -> bool { |
c30ab7b3 | 112 | if self.ch.is_none() { |
9e0c209e SL |
113 | return true; |
114 | } | |
115 | ||
116 | match self.terminator { | |
c30ab7b3 | 117 | Some(t) => self.next_pos > t, |
9e0c209e SL |
118 | None => false, |
119 | } | |
9cc50fc6 | 120 | } |
1a4d82fc | 121 | /// Return the next token. EFFECT: advances the string_reader. |
a7813a04 XL |
122 | fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> { |
123 | assert!(self.fatal_errs.is_empty()); | |
1a4d82fc JJ |
124 | let ret_val = TokenAndSpan { |
125 | tok: replace(&mut self.peek_tok, token::Underscore), | |
126 | sp: self.peek_span, | |
127 | }; | |
a7813a04 XL |
128 | self.advance_token()?; |
129 | Ok(ret_val) | |
1a4d82fc | 130 | } |
92a42be0 | 131 | fn fatal(&self, m: &str) -> FatalError { |
1a4d82fc JJ |
132 | self.fatal_span(self.peek_span, m) |
133 | } | |
134 | fn err(&self, m: &str) { | |
135 | self.err_span(self.peek_span, m) | |
136 | } | |
a7813a04 XL |
137 | fn emit_fatal_errors(&mut self) { |
138 | for err in &mut self.fatal_errs { | |
139 | err.emit(); | |
140 | } | |
141 | self.fatal_errs.clear(); | |
142 | } | |
1a4d82fc JJ |
143 | fn peek(&self) -> TokenAndSpan { |
144 | // FIXME(pcwalton): Bad copy! | |
145 | TokenAndSpan { | |
146 | tok: self.peek_tok.clone(), | |
147 | sp: self.peek_span, | |
148 | } | |
149 | } | |
150 | } | |
151 | ||
152 | impl<'a> Reader for TtReader<'a> { | |
153 | fn is_eof(&self) -> bool { | |
c30ab7b3 | 154 | self.peek().tok == token::Eof |
1a4d82fc | 155 | } |
a7813a04 XL |
156 | fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> { |
157 | assert!(self.fatal_errs.is_empty()); | |
1a4d82fc JJ |
158 | let r = tt_next_token(self); |
159 | debug!("TtReader: r={:?}", r); | |
a7813a04 | 160 | Ok(r) |
1a4d82fc | 161 | } |
92a42be0 SL |
162 | fn fatal(&self, m: &str) -> FatalError { |
163 | self.sp_diag.span_fatal(self.cur_span, m) | |
1a4d82fc JJ |
164 | } |
165 | fn err(&self, m: &str) { | |
166 | self.sp_diag.span_err(self.cur_span, m); | |
167 | } | |
a7813a04 XL |
168 | fn emit_fatal_errors(&mut self) { |
169 | for err in &mut self.fatal_errs { | |
170 | err.emit(); | |
171 | } | |
172 | self.fatal_errs.clear(); | |
173 | } | |
1a4d82fc JJ |
174 | fn peek(&self) -> TokenAndSpan { |
175 | TokenAndSpan { | |
176 | tok: self.cur_tok.clone(), | |
177 | sp: self.cur_span, | |
178 | } | |
179 | } | |
180 | } | |
181 | ||
1a4d82fc | 182 | impl<'a> StringReader<'a> { |
c30ab7b3 | 183 | /// For comments.rs, which hackily pokes into next_pos and ch |
9cc50fc6 | 184 | pub fn new_raw<'b>(span_diagnostic: &'b Handler, |
3157f602 | 185 | filemap: Rc<syntax_pos::FileMap>) |
9cc50fc6 | 186 | -> StringReader<'b> { |
9e0c209e SL |
187 | let mut sr = StringReader::new_raw_internal(span_diagnostic, filemap); |
188 | sr.bump(); | |
189 | sr | |
190 | } | |
191 | ||
192 | fn new_raw_internal<'b>(span_diagnostic: &'b Handler, | |
193 | filemap: Rc<syntax_pos::FileMap>) | |
194 | -> StringReader<'b> { | |
c34b1796 | 195 | if filemap.src.is_none() { |
9cc50fc6 SL |
196 | span_diagnostic.bug(&format!("Cannot lex filemap \ |
197 | without source: {}", | |
198 | filemap.name)[..]); | |
c34b1796 AL |
199 | } |
200 | ||
201 | let source_text = (*filemap.src.as_ref().unwrap()).clone(); | |
202 | ||
9e0c209e | 203 | StringReader { |
1a4d82fc | 204 | span_diagnostic: span_diagnostic, |
c30ab7b3 | 205 | next_pos: filemap.start_pos, |
1a4d82fc | 206 | pos: filemap.start_pos, |
1a4d82fc | 207 | col: CharPos(0), |
c30ab7b3 | 208 | ch: Some('\n'), |
1a4d82fc | 209 | filemap: filemap, |
9e0c209e SL |
210 | terminator: None, |
211 | save_new_lines: true, | |
9cc50fc6 | 212 | // dummy values; not read |
1a4d82fc | 213 | peek_tok: token::Eof, |
3157f602 | 214 | peek_span: syntax_pos::DUMMY_SP, |
9cc50fc6 | 215 | source_text: source_text, |
a7813a04 | 216 | fatal_errs: Vec::new(), |
9e0c209e | 217 | } |
1a4d82fc JJ |
218 | } |
219 | ||
9cc50fc6 | 220 | pub fn new<'b>(span_diagnostic: &'b Handler, |
3157f602 | 221 | filemap: Rc<syntax_pos::FileMap>) |
9cc50fc6 | 222 | -> StringReader<'b> { |
1a4d82fc | 223 | let mut sr = StringReader::new_raw(span_diagnostic, filemap); |
a7813a04 XL |
224 | if let Err(_) = sr.advance_token() { |
225 | sr.emit_fatal_errors(); | |
226 | panic!(FatalError); | |
227 | } | |
1a4d82fc JJ |
228 | sr |
229 | } | |
230 | ||
c30ab7b3 SL |
231 | pub fn ch_is(&self, c: char) -> bool { |
232 | self.ch == Some(c) | |
1a4d82fc JJ |
233 | } |
234 | ||
235 | /// Report a fatal lexical error with a given span. | |
92a42be0 SL |
236 | pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError { |
237 | self.span_diagnostic.span_fatal(sp, m) | |
1a4d82fc JJ |
238 | } |
239 | ||
240 | /// Report a lexical error with a given span. | |
241 | pub fn err_span(&self, sp: Span, m: &str) { | |
242 | self.span_diagnostic.span_err(sp, m) | |
243 | } | |
244 | ||
c1a9b12d | 245 | |
1a4d82fc | 246 | /// Report a fatal error spanning [`from_pos`, `to_pos`). |
92a42be0 | 247 | fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError { |
3157f602 | 248 | self.fatal_span(syntax_pos::mk_sp(from_pos, to_pos), m) |
1a4d82fc JJ |
249 | } |
250 | ||
251 | /// Report a lexical error spanning [`from_pos`, `to_pos`). | |
252 | fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) { | |
3157f602 | 253 | self.err_span(syntax_pos::mk_sp(from_pos, to_pos), m) |
1a4d82fc JJ |
254 | } |
255 | ||
256 | /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an | |
257 | /// escaped character to the error message | |
92a42be0 | 258 | fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError { |
1a4d82fc JJ |
259 | let mut m = m.to_string(); |
260 | m.push_str(": "); | |
9cc50fc6 SL |
261 | for c in c.escape_default() { |
262 | m.push(c) | |
263 | } | |
92a42be0 | 264 | self.fatal_span_(from_pos, to_pos, &m[..]) |
1a4d82fc | 265 | } |
9cc50fc6 SL |
266 | fn struct_fatal_span_char(&self, |
267 | from_pos: BytePos, | |
268 | to_pos: BytePos, | |
269 | m: &str, | |
270 | c: char) | |
271 | -> DiagnosticBuilder<'a> { | |
272 | let mut m = m.to_string(); | |
273 | m.push_str(": "); | |
274 | for c in c.escape_default() { | |
275 | m.push(c) | |
276 | } | |
3157f602 | 277 | self.span_diagnostic.struct_span_fatal(syntax_pos::mk_sp(from_pos, to_pos), &m[..]) |
9cc50fc6 | 278 | } |
1a4d82fc JJ |
279 | |
280 | /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an | |
281 | /// escaped character to the error message | |
282 | fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) { | |
283 | let mut m = m.to_string(); | |
284 | m.push_str(": "); | |
9cc50fc6 SL |
285 | for c in c.escape_default() { |
286 | m.push(c) | |
287 | } | |
85aaf69f | 288 | self.err_span_(from_pos, to_pos, &m[..]); |
1a4d82fc | 289 | } |
9cc50fc6 SL |
290 | fn struct_err_span_char(&self, |
291 | from_pos: BytePos, | |
292 | to_pos: BytePos, | |
293 | m: &str, | |
294 | c: char) | |
295 | -> DiagnosticBuilder<'a> { | |
296 | let mut m = m.to_string(); | |
297 | m.push_str(": "); | |
298 | for c in c.escape_default() { | |
299 | m.push(c) | |
300 | } | |
3157f602 | 301 | self.span_diagnostic.struct_span_err(syntax_pos::mk_sp(from_pos, to_pos), &m[..]) |
9cc50fc6 | 302 | } |
1a4d82fc JJ |
303 | |
304 | /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the | |
305 | /// offending string to the error message | |
92a42be0 | 306 | fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError { |
1a4d82fc | 307 | m.push_str(": "); |
85aaf69f SL |
308 | let from = self.byte_offset(from_pos).to_usize(); |
309 | let to = self.byte_offset(to_pos).to_usize(); | |
c34b1796 | 310 | m.push_str(&self.source_text[from..to]); |
92a42be0 | 311 | self.fatal_span_(from_pos, to_pos, &m[..]) |
1a4d82fc JJ |
312 | } |
313 | ||
314 | /// Advance peek_tok and peek_span to refer to the next token, and | |
315 | /// possibly update the interner. | |
a7813a04 | 316 | fn advance_token(&mut self) -> Result<(), ()> { |
1a4d82fc JJ |
317 | match self.scan_whitespace_or_comment() { |
318 | Some(comment) => { | |
319 | self.peek_span = comment.sp; | |
320 | self.peek_tok = comment.tok; | |
9cc50fc6 | 321 | } |
1a4d82fc JJ |
322 | None => { |
323 | if self.is_eof() { | |
324 | self.peek_tok = token::Eof; | |
3157f602 | 325 | self.peek_span = syntax_pos::mk_sp(self.filemap.end_pos, self.filemap.end_pos); |
1a4d82fc | 326 | } else { |
c30ab7b3 | 327 | let start_bytepos = self.pos; |
a7813a04 | 328 | self.peek_tok = self.next_token_inner()?; |
c30ab7b3 | 329 | self.peek_span = syntax_pos::mk_sp(start_bytepos, self.pos); |
1a4d82fc JJ |
330 | }; |
331 | } | |
332 | } | |
a7813a04 | 333 | Ok(()) |
1a4d82fc JJ |
334 | } |
335 | ||
336 | fn byte_offset(&self, pos: BytePos) -> BytePos { | |
337 | (pos - self.filemap.start_pos) | |
338 | } | |
339 | ||
340 | /// Calls `f` with a string slice of the source text spanning from `start` | |
c30ab7b3 SL |
341 | /// up to but excluding `self.pos`, meaning the slice does not include |
342 | /// the character `self.ch`. | |
9cc50fc6 SL |
343 | pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T |
344 | where F: FnOnce(&str) -> T | |
1a4d82fc | 345 | { |
c30ab7b3 | 346 | self.with_str_from_to(start, self.pos, f) |
1a4d82fc JJ |
347 | } |
348 | ||
349 | /// Create a Name from a given offset to the current offset, each | |
350 | /// adjusted 1 towards each other (assumes that on either side there is a | |
351 | /// single-byte delimiter). | |
352 | pub fn name_from(&self, start: BytePos) -> ast::Name { | |
c30ab7b3 | 353 | debug!("taking an ident from {:?} to {:?}", start, self.pos); |
476ff2be | 354 | self.with_str_from(start, Symbol::intern) |
1a4d82fc JJ |
355 | } |
356 | ||
357 | /// As name_from, with an explicit endpoint. | |
358 | pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name { | |
359 | debug!("taking an ident from {:?} to {:?}", start, end); | |
476ff2be | 360 | self.with_str_from_to(start, end, Symbol::intern) |
1a4d82fc JJ |
361 | } |
362 | ||
363 | /// Calls `f` with a string slice of the source text spanning from `start` | |
364 | /// up to but excluding `end`. | |
9cc50fc6 SL |
365 | fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T |
366 | where F: FnOnce(&str) -> T | |
1a4d82fc | 367 | { |
9cc50fc6 | 368 | f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()]) |
1a4d82fc JJ |
369 | } |
370 | ||
371 | /// Converts CRLF to LF in the given string, raising an error on bare CR. | |
9cc50fc6 | 372 | fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> { |
85aaf69f | 373 | let mut i = 0; |
1a4d82fc | 374 | while i < s.len() { |
d9579d0f | 375 | let ch = char_at(s, i); |
c34b1796 | 376 | let next = i + ch.len_utf8(); |
1a4d82fc | 377 | if ch == '\r' { |
d9579d0f AL |
378 | if next < s.len() && char_at(s, next) == '\n' { |
379 | return translate_crlf_(self, start, s, errmsg, i).into(); | |
1a4d82fc JJ |
380 | } |
381 | let pos = start + BytePos(i as u32); | |
382 | let end_pos = start + BytePos(next as u32); | |
383 | self.err_span_(pos, end_pos, errmsg); | |
384 | } | |
385 | i = next; | |
386 | } | |
d9579d0f | 387 | return s.into(); |
1a4d82fc | 388 | |
9cc50fc6 SL |
389 | fn translate_crlf_(rdr: &StringReader, |
390 | start: BytePos, | |
391 | s: &str, | |
392 | errmsg: &str, | |
393 | mut i: usize) | |
394 | -> String { | |
1a4d82fc JJ |
395 | let mut buf = String::with_capacity(s.len()); |
396 | let mut j = 0; | |
397 | while i < s.len() { | |
d9579d0f | 398 | let ch = char_at(s, i); |
c34b1796 | 399 | let next = i + ch.len_utf8(); |
1a4d82fc | 400 | if ch == '\r' { |
9cc50fc6 SL |
401 | if j < i { |
402 | buf.push_str(&s[j..i]); | |
403 | } | |
1a4d82fc | 404 | j = next; |
d9579d0f | 405 | if next >= s.len() || char_at(s, next) != '\n' { |
1a4d82fc JJ |
406 | let pos = start + BytePos(i as u32); |
407 | let end_pos = start + BytePos(next as u32); | |
408 | rdr.err_span_(pos, end_pos, errmsg); | |
409 | } | |
410 | } | |
411 | i = next; | |
412 | } | |
9cc50fc6 SL |
413 | if j < s.len() { |
414 | buf.push_str(&s[j..]); | |
415 | } | |
1a4d82fc JJ |
416 | buf |
417 | } | |
418 | } | |
419 | ||
420 | ||
421 | /// Advance the StringReader by one character. If a newline is | |
422 | /// discovered, add it to the FileMap's list of line start offsets. | |
423 | pub fn bump(&mut self) { | |
c30ab7b3 SL |
424 | let new_pos = self.next_pos; |
425 | let new_byte_offset = self.byte_offset(new_pos).to_usize(); | |
426 | if new_byte_offset < self.source_text.len() { | |
427 | let old_ch_is_newline = self.ch.unwrap() == '\n'; | |
428 | let new_ch = char_at(&self.source_text, new_byte_offset); | |
429 | let new_ch_len = new_ch.len_utf8(); | |
430 | ||
431 | self.ch = Some(new_ch); | |
432 | self.pos = new_pos; | |
433 | self.next_pos = new_pos + Pos::from_usize(new_ch_len); | |
434 | if old_ch_is_newline { | |
9e0c209e | 435 | if self.save_new_lines { |
c30ab7b3 | 436 | self.filemap.next_line(self.pos); |
9e0c209e | 437 | } |
85aaf69f | 438 | self.col = CharPos(0); |
c30ab7b3 SL |
439 | } else { |
440 | self.col = self.col + CharPos(1); | |
1a4d82fc | 441 | } |
c30ab7b3 SL |
442 | if new_ch_len > 1 { |
443 | self.filemap.record_multibyte_char(self.pos, new_ch_len); | |
1a4d82fc JJ |
444 | } |
445 | } else { | |
c30ab7b3 SL |
446 | self.ch = None; |
447 | self.pos = new_pos; | |
1a4d82fc JJ |
448 | } |
449 | } | |
450 | ||
451 | pub fn nextch(&self) -> Option<char> { | |
c30ab7b3 | 452 | let offset = self.byte_offset(self.next_pos).to_usize(); |
c34b1796 | 453 | if offset < self.source_text.len() { |
d9579d0f | 454 | Some(char_at(&self.source_text, offset)) |
1a4d82fc JJ |
455 | } else { |
456 | None | |
457 | } | |
458 | } | |
459 | ||
460 | pub fn nextch_is(&self, c: char) -> bool { | |
461 | self.nextch() == Some(c) | |
462 | } | |
463 | ||
464 | pub fn nextnextch(&self) -> Option<char> { | |
c30ab7b3 | 465 | let offset = self.byte_offset(self.next_pos).to_usize(); |
c34b1796 | 466 | let s = &self.source_text[..]; |
9cc50fc6 SL |
467 | if offset >= s.len() { |
468 | return None; | |
469 | } | |
d9579d0f | 470 | let next = offset + char_at(s, offset).len_utf8(); |
1a4d82fc | 471 | if next < s.len() { |
d9579d0f | 472 | Some(char_at(s, next)) |
1a4d82fc JJ |
473 | } else { |
474 | None | |
475 | } | |
476 | } | |
477 | ||
478 | pub fn nextnextch_is(&self, c: char) -> bool { | |
479 | self.nextnextch() == Some(c) | |
480 | } | |
481 | ||
482 | /// Eats <XID_start><XID_continue>*, if possible. | |
483 | fn scan_optional_raw_name(&mut self) -> Option<ast::Name> { | |
c30ab7b3 | 484 | if !ident_start(self.ch) { |
9cc50fc6 | 485 | return None; |
1a4d82fc | 486 | } |
c30ab7b3 SL |
487 | let start = self.pos; |
488 | while ident_continue(self.ch) { | |
1a4d82fc JJ |
489 | self.bump(); |
490 | } | |
491 | ||
492 | self.with_str_from(start, |string| { | |
493 | if string == "_" { | |
494 | None | |
495 | } else { | |
476ff2be | 496 | Some(Symbol::intern(string)) |
1a4d82fc JJ |
497 | } |
498 | }) | |
499 | } | |
500 | ||
c30ab7b3 | 501 | /// PRECONDITION: self.ch is not whitespace |
1a4d82fc JJ |
502 | /// Eats any kind of comment. |
503 | fn scan_comment(&mut self) -> Option<TokenAndSpan> { | |
c30ab7b3 | 504 | if let Some(c) = self.ch { |
3157f602 | 505 | if c.is_whitespace() { |
c30ab7b3 | 506 | self.span_diagnostic.span_err(syntax_pos::mk_sp(self.pos, self.pos), |
3157f602 XL |
507 | "called consume_any_line_comment, but there \ |
508 | was whitespace"); | |
9cc50fc6 | 509 | } |
1a4d82fc JJ |
510 | } |
511 | ||
c30ab7b3 | 512 | if self.ch_is('/') { |
1a4d82fc JJ |
513 | match self.nextch() { |
514 | Some('/') => { | |
515 | self.bump(); | |
516 | self.bump(); | |
62682a34 | 517 | |
1a4d82fc | 518 | // line comments starting with "///" or "//!" are doc-comments |
c30ab7b3 SL |
519 | let doc_comment = self.ch_is('/') || self.ch_is('!'); |
520 | let start_bpos = self.pos - BytePos(2); | |
62682a34 SL |
521 | |
522 | while !self.is_eof() { | |
c30ab7b3 | 523 | match self.ch.unwrap() { |
62682a34 SL |
524 | '\n' => break, |
525 | '\r' => { | |
526 | if self.nextch_is('\n') { | |
527 | // CRLF | |
9cc50fc6 | 528 | break; |
62682a34 | 529 | } else if doc_comment { |
c30ab7b3 SL |
530 | self.err_span_(self.pos, |
531 | self.next_pos, | |
62682a34 | 532 | "bare CR not allowed in doc-comment"); |
1a4d82fc | 533 | } |
1a4d82fc | 534 | } |
9cc50fc6 | 535 | _ => (), |
1a4d82fc | 536 | } |
62682a34 SL |
537 | self.bump(); |
538 | } | |
539 | ||
540 | return if doc_comment { | |
541 | self.with_str_from(start_bpos, |string| { | |
542 | // comments with only more "/"s are not doc comments | |
1a4d82fc | 543 | let tok = if is_doc_comment(string) { |
476ff2be | 544 | token::DocComment(Symbol::intern(string)) |
1a4d82fc JJ |
545 | } else { |
546 | token::Comment | |
547 | }; | |
548 | ||
62682a34 | 549 | Some(TokenAndSpan { |
1a4d82fc | 550 | tok: tok, |
c30ab7b3 | 551 | sp: syntax_pos::mk_sp(start_bpos, self.pos), |
62682a34 SL |
552 | }) |
553 | }) | |
1a4d82fc | 554 | } else { |
62682a34 | 555 | Some(TokenAndSpan { |
1a4d82fc | 556 | tok: token::Comment, |
c30ab7b3 | 557 | sp: syntax_pos::mk_sp(start_bpos, self.pos), |
62682a34 | 558 | }) |
9cc50fc6 | 559 | }; |
1a4d82fc JJ |
560 | } |
561 | Some('*') => { | |
9cc50fc6 SL |
562 | self.bump(); |
563 | self.bump(); | |
1a4d82fc JJ |
564 | self.scan_block_comment() |
565 | } | |
9cc50fc6 | 566 | _ => None, |
1a4d82fc | 567 | } |
c30ab7b3 | 568 | } else if self.ch_is('#') { |
1a4d82fc JJ |
569 | if self.nextch_is('!') { |
570 | ||
571 | // Parse an inner attribute. | |
572 | if self.nextnextch_is('[') { | |
573 | return None; | |
574 | } | |
575 | ||
576 | // I guess this is the only way to figure out if | |
577 | // we're at the beginning of the file... | |
578 | let cmap = CodeMap::new(); | |
579 | cmap.files.borrow_mut().push(self.filemap.clone()); | |
c30ab7b3 | 580 | let loc = cmap.lookup_char_pos_adj(self.pos); |
1a4d82fc | 581 | debug!("Skipping a shebang"); |
85aaf69f | 582 | if loc.line == 1 && loc.col == CharPos(0) { |
1a4d82fc | 583 | // FIXME: Add shebang "token", return it |
c30ab7b3 SL |
584 | let start = self.pos; |
585 | while !self.ch_is('\n') && !self.is_eof() { | |
9cc50fc6 SL |
586 | self.bump(); |
587 | } | |
1a4d82fc JJ |
588 | return Some(TokenAndSpan { |
589 | tok: token::Shebang(self.name_from(start)), | |
c30ab7b3 | 590 | sp: syntax_pos::mk_sp(start, self.pos), |
1a4d82fc JJ |
591 | }); |
592 | } | |
593 | } | |
594 | None | |
595 | } else { | |
596 | None | |
597 | } | |
598 | } | |
599 | ||
600 | /// If there is whitespace, shebang, or a comment, scan it. Otherwise, | |
601 | /// return None. | |
602 | fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> { | |
c30ab7b3 | 603 | match self.ch.unwrap_or('\0') { |
1a4d82fc JJ |
604 | // # to handle shebang at start of file -- this is the entry point |
605 | // for skipping over all "junk" | |
606 | '/' | '#' => { | |
607 | let c = self.scan_comment(); | |
608 | debug!("scanning a comment {:?}", c); | |
609 | c | |
54a0048b SL |
610 | }, |
611 | c if is_pattern_whitespace(Some(c)) => { | |
c30ab7b3 SL |
612 | let start_bpos = self.pos; |
613 | while is_pattern_whitespace(self.ch) { | |
9cc50fc6 SL |
614 | self.bump(); |
615 | } | |
1a4d82fc JJ |
616 | let c = Some(TokenAndSpan { |
617 | tok: token::Whitespace, | |
c30ab7b3 | 618 | sp: syntax_pos::mk_sp(start_bpos, self.pos), |
1a4d82fc JJ |
619 | }); |
620 | debug!("scanning whitespace: {:?}", c); | |
621 | c | |
9cc50fc6 SL |
622 | } |
623 | _ => None, | |
1a4d82fc JJ |
624 | } |
625 | } | |
626 | ||
627 | /// Might return a sugared-doc-attr | |
628 | fn scan_block_comment(&mut self) -> Option<TokenAndSpan> { | |
629 | // block comments starting with "/**" or "/*!" are doc-comments | |
c30ab7b3 SL |
630 | let is_doc_comment = self.ch_is('*') || self.ch_is('!'); |
631 | let start_bpos = self.pos - BytePos(2); | |
1a4d82fc | 632 | |
85aaf69f | 633 | let mut level: isize = 1; |
1a4d82fc JJ |
634 | let mut has_cr = false; |
635 | while level > 0 { | |
636 | if self.is_eof() { | |
637 | let msg = if is_doc_comment { | |
638 | "unterminated block doc-comment" | |
639 | } else { | |
640 | "unterminated block comment" | |
641 | }; | |
c30ab7b3 | 642 | let last_bpos = self.pos; |
92a42be0 | 643 | panic!(self.fatal_span_(start_bpos, last_bpos, msg)); |
1a4d82fc | 644 | } |
c30ab7b3 | 645 | let n = self.ch.unwrap(); |
1a4d82fc JJ |
646 | match n { |
647 | '/' if self.nextch_is('*') => { | |
648 | level += 1; | |
649 | self.bump(); | |
650 | } | |
651 | '*' if self.nextch_is('/') => { | |
652 | level -= 1; | |
653 | self.bump(); | |
654 | } | |
655 | '\r' => { | |
656 | has_cr = true; | |
657 | } | |
9cc50fc6 | 658 | _ => (), |
1a4d82fc JJ |
659 | } |
660 | self.bump(); | |
661 | } | |
662 | ||
663 | self.with_str_from(start_bpos, |string| { | |
664 | // but comments with only "*"s between two "/"s are not | |
665 | let tok = if is_block_doc_comment(string) { | |
666 | let string = if has_cr { | |
9cc50fc6 SL |
667 | self.translate_crlf(start_bpos, |
668 | string, | |
1a4d82fc | 669 | "bare CR not allowed in block doc-comment") |
9cc50fc6 SL |
670 | } else { |
671 | string.into() | |
672 | }; | |
476ff2be | 673 | token::DocComment(Symbol::intern(&string[..])) |
1a4d82fc JJ |
674 | } else { |
675 | token::Comment | |
676 | }; | |
677 | ||
9cc50fc6 | 678 | Some(TokenAndSpan { |
1a4d82fc | 679 | tok: tok, |
c30ab7b3 | 680 | sp: syntax_pos::mk_sp(start_bpos, self.pos), |
1a4d82fc JJ |
681 | }) |
682 | }) | |
683 | } | |
684 | ||
c34b1796 AL |
685 | /// Scan through any digits (base `scan_radix`) or underscores, |
686 | /// and return how many digits there were. | |
687 | /// | |
688 | /// `real_radix` represents the true radix of the number we're | |
689 | /// interested in, and errors will be emitted for any digits | |
690 | /// between `real_radix` and `scan_radix`. | |
691 | fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize { | |
692 | assert!(real_radix <= scan_radix); | |
85aaf69f | 693 | let mut len = 0; |
1a4d82fc | 694 | loop { |
c30ab7b3 | 695 | let c = self.ch; |
9cc50fc6 SL |
696 | if c == Some('_') { |
697 | debug!("skipping a _"); | |
698 | self.bump(); | |
699 | continue; | |
700 | } | |
c34b1796 | 701 | match c.and_then(|cc| cc.to_digit(scan_radix)) { |
1a4d82fc JJ |
702 | Some(_) => { |
703 | debug!("{:?} in scan_digits", c); | |
c34b1796 AL |
704 | // check that the hypothetical digit is actually |
705 | // in range for the true radix | |
706 | if c.unwrap().to_digit(real_radix).is_none() { | |
c30ab7b3 SL |
707 | self.err_span_(self.pos, |
708 | self.next_pos, | |
9cc50fc6 | 709 | &format!("invalid digit for a base {} literal", real_radix)); |
c34b1796 | 710 | } |
1a4d82fc JJ |
711 | len += 1; |
712 | self.bump(); | |
713 | } | |
9cc50fc6 | 714 | _ => return len, |
1a4d82fc | 715 | } |
9cc50fc6 | 716 | } |
1a4d82fc JJ |
717 | } |
718 | ||
719 | /// Lex a LIT_INTEGER or a LIT_FLOAT | |
720 | fn scan_number(&mut self, c: char) -> token::Lit { | |
c1a9b12d | 721 | let num_digits; |
1a4d82fc | 722 | let mut base = 10; |
c30ab7b3 | 723 | let start_bpos = self.pos; |
1a4d82fc JJ |
724 | |
725 | self.bump(); | |
726 | ||
727 | if c == '0' { | |
c30ab7b3 | 728 | match self.ch.unwrap_or('\0') { |
9cc50fc6 SL |
729 | 'b' => { |
730 | self.bump(); | |
731 | base = 2; | |
732 | num_digits = self.scan_digits(2, 10); | |
733 | } | |
734 | 'o' => { | |
735 | self.bump(); | |
736 | base = 8; | |
737 | num_digits = self.scan_digits(8, 10); | |
738 | } | |
739 | 'x' => { | |
740 | self.bump(); | |
741 | base = 16; | |
742 | num_digits = self.scan_digits(16, 16); | |
743 | } | |
1a4d82fc | 744 | '0'...'9' | '_' | '.' => { |
c34b1796 | 745 | num_digits = self.scan_digits(10, 10) + 1; |
1a4d82fc JJ |
746 | } |
747 | _ => { | |
748 | // just a 0 | |
749 | return token::Integer(self.name_from(start_bpos)); | |
750 | } | |
751 | } | |
752 | } else if c.is_digit(10) { | |
c34b1796 | 753 | num_digits = self.scan_digits(10, 10) + 1; |
1a4d82fc JJ |
754 | } else { |
755 | num_digits = 0; | |
756 | } | |
757 | ||
758 | if num_digits == 0 { | |
9cc50fc6 | 759 | self.err_span_(start_bpos, |
c30ab7b3 | 760 | self.pos, |
9cc50fc6 | 761 | "no valid digits found for number"); |
476ff2be | 762 | return token::Integer(Symbol::intern("0")); |
1a4d82fc JJ |
763 | } |
764 | ||
765 | // might be a float, but don't be greedy if this is actually an | |
766 | // integer literal followed by field/method access or a range pattern | |
767 | // (`0..2` and `12.foo()`) | |
c30ab7b3 | 768 | if self.ch_is('.') && !self.nextch_is('.') && |
9cc50fc6 SL |
769 | !self.nextch() |
770 | .unwrap_or('\0') | |
771 | .is_xid_start() { | |
1a4d82fc JJ |
772 | // might have stuff after the ., and if it does, it needs to start |
773 | // with a number | |
774 | self.bump(); | |
c30ab7b3 | 775 | if self.ch.unwrap_or('\0').is_digit(10) { |
c34b1796 | 776 | self.scan_digits(10, 10); |
1a4d82fc JJ |
777 | self.scan_float_exponent(); |
778 | } | |
c30ab7b3 SL |
779 | let pos = self.pos; |
780 | self.check_float_base(start_bpos, pos, base); | |
1a4d82fc JJ |
781 | return token::Float(self.name_from(start_bpos)); |
782 | } else { | |
783 | // it might be a float if it has an exponent | |
c30ab7b3 | 784 | if self.ch_is('e') || self.ch_is('E') { |
1a4d82fc | 785 | self.scan_float_exponent(); |
c30ab7b3 SL |
786 | let pos = self.pos; |
787 | self.check_float_base(start_bpos, pos, base); | |
1a4d82fc JJ |
788 | return token::Float(self.name_from(start_bpos)); |
789 | } | |
790 | // but we certainly have an integer! | |
791 | return token::Integer(self.name_from(start_bpos)); | |
792 | } | |
793 | } | |
794 | ||
795 | /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an | |
796 | /// error if too many or too few digits are encountered. | |
9cc50fc6 | 797 | fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool { |
1a4d82fc | 798 | debug!("scanning {} digits until {:?}", n_digits, delim); |
c30ab7b3 | 799 | let start_bpos = self.pos; |
1a4d82fc JJ |
800 | let mut accum_int = 0; |
801 | ||
c34b1796 | 802 | let mut valid = true; |
85aaf69f | 803 | for _ in 0..n_digits { |
1a4d82fc | 804 | if self.is_eof() { |
c30ab7b3 | 805 | let last_bpos = self.pos; |
92a42be0 SL |
806 | panic!(self.fatal_span_(start_bpos, |
807 | last_bpos, | |
808 | "unterminated numeric character escape")); | |
1a4d82fc | 809 | } |
c30ab7b3 SL |
810 | if self.ch_is(delim) { |
811 | let last_bpos = self.pos; | |
9cc50fc6 SL |
812 | self.err_span_(start_bpos, |
813 | last_bpos, | |
814 | "numeric character escape is too short"); | |
c34b1796 | 815 | valid = false; |
1a4d82fc JJ |
816 | break; |
817 | } | |
c30ab7b3 | 818 | let c = self.ch.unwrap_or('\x00'); |
1a4d82fc JJ |
819 | accum_int *= 16; |
820 | accum_int += c.to_digit(16).unwrap_or_else(|| { | |
c30ab7b3 SL |
821 | self.err_span_char(self.pos, |
822 | self.next_pos, | |
9cc50fc6 SL |
823 | "invalid character in numeric character escape", |
824 | c); | |
c34b1796 AL |
825 | |
826 | valid = false; | |
1a4d82fc | 827 | 0 |
c34b1796 | 828 | }); |
1a4d82fc JJ |
829 | self.bump(); |
830 | } | |
831 | ||
832 | if below_0x7f_only && accum_int >= 0x80 { | |
833 | self.err_span_(start_bpos, | |
c30ab7b3 | 834 | self.pos, |
9cc50fc6 SL |
835 | "this form of character escape may only be used with characters in \ |
836 | the range [\\x00-\\x7f]"); | |
c34b1796 | 837 | valid = false; |
1a4d82fc JJ |
838 | } |
839 | ||
840 | match char::from_u32(accum_int) { | |
c34b1796 | 841 | Some(_) => valid, |
1a4d82fc | 842 | None => { |
c30ab7b3 | 843 | let last_bpos = self.pos; |
c1a9b12d | 844 | self.err_span_(start_bpos, last_bpos, "invalid numeric character escape"); |
1a4d82fc JJ |
845 | false |
846 | } | |
847 | } | |
848 | } | |
849 | ||
1a4d82fc JJ |
850 | /// Scan for a single (possibly escaped) byte or char |
851 | /// in a byte, (non-raw) byte string, char, or (non-raw) string literal. | |
852 | /// `start` is the position of `first_source_char`, which is already consumed. | |
853 | /// | |
854 | /// Returns true if there was a valid char/byte, false otherwise. | |
9cc50fc6 SL |
855 | fn scan_char_or_byte(&mut self, |
856 | start: BytePos, | |
857 | first_source_char: char, | |
858 | ascii_only: bool, | |
859 | delim: char) | |
860 | -> bool { | |
1a4d82fc JJ |
861 | match first_source_char { |
862 | '\\' => { | |
863 | // '\X' for some X must be a character constant: | |
c30ab7b3 SL |
864 | let escaped = self.ch; |
865 | let escaped_pos = self.pos; | |
1a4d82fc JJ |
866 | self.bump(); |
867 | match escaped { | |
9cc50fc6 | 868 | None => {} // EOF here is an error that will be checked later. |
1a4d82fc JJ |
869 | Some(e) => { |
870 | return match e { | |
871 | 'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true, | |
872 | 'x' => self.scan_byte_escape(delim, !ascii_only), | |
c1a9b12d | 873 | 'u' => { |
c30ab7b3 | 874 | let valid = if self.ch_is('{') { |
c1a9b12d SL |
875 | self.scan_unicode_escape(delim) && !ascii_only |
876 | } else { | |
c30ab7b3 | 877 | let span = syntax_pos::mk_sp(start, self.pos); |
9cc50fc6 SL |
878 | self.span_diagnostic |
879 | .struct_span_err(span, "incorrect unicode escape sequence") | |
880 | .span_help(span, | |
881 | "format of unicode escape sequences is \ | |
882 | `\\u{…}`") | |
883 | .emit(); | |
c1a9b12d SL |
884 | false |
885 | }; | |
886 | if ascii_only { | |
9cc50fc6 | 887 | self.err_span_(start, |
c30ab7b3 | 888 | self.pos, |
9cc50fc6 SL |
889 | "unicode escape sequences cannot be used as a \ |
890 | byte or in a byte string"); | |
62682a34 | 891 | } |
c1a9b12d SL |
892 | valid |
893 | ||
1a4d82fc JJ |
894 | } |
895 | '\n' if delim == '"' => { | |
896 | self.consume_whitespace(); | |
897 | true | |
9cc50fc6 | 898 | } |
c30ab7b3 | 899 | '\r' if delim == '"' && self.ch_is('\n') => { |
1a4d82fc JJ |
900 | self.consume_whitespace(); |
901 | true | |
902 | } | |
903 | c => { | |
c30ab7b3 | 904 | let pos = self.pos; |
9cc50fc6 | 905 | let mut err = self.struct_err_span_char(escaped_pos, |
c30ab7b3 | 906 | pos, |
9cc50fc6 SL |
907 | if ascii_only { |
908 | "unknown byte escape" | |
909 | } else { | |
910 | "unknown character \ | |
911 | escape" | |
912 | }, | |
913 | c); | |
1a4d82fc | 914 | if e == '\r' { |
c30ab7b3 | 915 | err.span_help(syntax_pos::mk_sp(escaped_pos, pos), |
9cc50fc6 SL |
916 | "this is an isolated carriage return; consider \ |
917 | checking your editor and version control \ | |
918 | settings"); | |
1a4d82fc | 919 | } |
9346a6ac | 920 | if (e == '{' || e == '}') && !ascii_only { |
c30ab7b3 | 921 | err.span_help(syntax_pos::mk_sp(escaped_pos, pos), |
9cc50fc6 SL |
922 | "if used in a formatting string, curly braces \ |
923 | are escaped with `{{` and `}}`"); | |
9346a6ac | 924 | } |
9cc50fc6 | 925 | err.emit(); |
1a4d82fc JJ |
926 | false |
927 | } | |
928 | } | |
929 | } | |
930 | } | |
931 | } | |
932 | '\t' | '\n' | '\r' | '\'' if delim == '\'' => { | |
c30ab7b3 | 933 | let pos = self.pos; |
9cc50fc6 | 934 | self.err_span_char(start, |
c30ab7b3 | 935 | pos, |
9cc50fc6 SL |
936 | if ascii_only { |
937 | "byte constant must be escaped" | |
938 | } else { | |
939 | "character constant must be escaped" | |
940 | }, | |
941 | first_source_char); | |
1a4d82fc JJ |
942 | return false; |
943 | } | |
944 | '\r' => { | |
c30ab7b3 | 945 | if self.ch_is('\n') { |
1a4d82fc JJ |
946 | self.bump(); |
947 | return true; | |
948 | } else { | |
9cc50fc6 | 949 | self.err_span_(start, |
c30ab7b3 | 950 | self.pos, |
1a4d82fc JJ |
951 | "bare CR not allowed in string, use \\r instead"); |
952 | return false; | |
953 | } | |
954 | } | |
9cc50fc6 SL |
955 | _ => { |
956 | if ascii_only && first_source_char > '\x7F' { | |
c30ab7b3 | 957 | let pos = self.pos; |
a7813a04 | 958 | self.err_span_(start, |
c30ab7b3 | 959 | pos, |
a7813a04 XL |
960 | "byte constant must be ASCII. Use a \\xHH escape for a \ |
961 | non-ASCII byte"); | |
9cc50fc6 SL |
962 | return false; |
963 | } | |
1a4d82fc JJ |
964 | } |
965 | } | |
966 | true | |
967 | } | |
968 | ||
969 | /// Scan over a \u{...} escape | |
970 | /// | |
971 | /// At this point, we have already seen the \ and the u, the { is the current character. We | |
972 | /// will read at least one digit, and up to 6, and pass over the }. | |
973 | fn scan_unicode_escape(&mut self, delim: char) -> bool { | |
974 | self.bump(); // past the { | |
c30ab7b3 | 975 | let start_bpos = self.pos; |
85aaf69f | 976 | let mut count = 0; |
1a4d82fc | 977 | let mut accum_int = 0; |
c34b1796 | 978 | let mut valid = true; |
1a4d82fc | 979 | |
c30ab7b3 SL |
980 | while !self.ch_is('}') && count <= 6 { |
981 | let c = match self.ch { | |
1a4d82fc JJ |
982 | Some(c) => c, |
983 | None => { | |
9cc50fc6 | 984 | panic!(self.fatal_span_(start_bpos, |
c30ab7b3 | 985 | self.pos, |
92a42be0 | 986 | "unterminated unicode escape (found EOF)")); |
1a4d82fc JJ |
987 | } |
988 | }; | |
989 | accum_int *= 16; | |
990 | accum_int += c.to_digit(16).unwrap_or_else(|| { | |
991 | if c == delim { | |
c30ab7b3 SL |
992 | panic!(self.fatal_span_(self.pos, |
993 | self.next_pos, | |
92a42be0 | 994 | "unterminated unicode escape (needed a `}`)")); |
1a4d82fc | 995 | } else { |
c30ab7b3 SL |
996 | self.err_span_char(self.pos, |
997 | self.next_pos, | |
9cc50fc6 SL |
998 | "invalid character in unicode escape", |
999 | c); | |
1a4d82fc | 1000 | } |
c34b1796 AL |
1001 | valid = false; |
1002 | 0 | |
1003 | }); | |
1a4d82fc JJ |
1004 | self.bump(); |
1005 | count += 1; | |
1006 | } | |
1007 | ||
1008 | if count > 6 { | |
9cc50fc6 | 1009 | self.err_span_(start_bpos, |
c30ab7b3 | 1010 | self.pos, |
9cc50fc6 | 1011 | "overlong unicode escape (can have at most 6 hex digits)"); |
c34b1796 | 1012 | valid = false; |
1a4d82fc JJ |
1013 | } |
1014 | ||
c34b1796 | 1015 | if valid && (char::from_u32(accum_int).is_none() || count == 0) { |
9cc50fc6 | 1016 | self.err_span_(start_bpos, |
c30ab7b3 | 1017 | self.pos, |
9cc50fc6 | 1018 | "invalid unicode character escape"); |
62682a34 | 1019 | valid = false; |
1a4d82fc JJ |
1020 | } |
1021 | ||
c1a9b12d | 1022 | self.bump(); // past the ending } |
1a4d82fc JJ |
1023 | valid |
1024 | } | |
1025 | ||
1026 | /// Scan over a float exponent. | |
1027 | fn scan_float_exponent(&mut self) { | |
c30ab7b3 | 1028 | if self.ch_is('e') || self.ch_is('E') { |
1a4d82fc | 1029 | self.bump(); |
c30ab7b3 | 1030 | if self.ch_is('-') || self.ch_is('+') { |
1a4d82fc JJ |
1031 | self.bump(); |
1032 | } | |
c34b1796 | 1033 | if self.scan_digits(10, 10) == 0 { |
c30ab7b3 SL |
1034 | self.err_span_(self.pos, |
1035 | self.next_pos, | |
9cc50fc6 | 1036 | "expected at least one digit in exponent") |
1a4d82fc JJ |
1037 | } |
1038 | } | |
1039 | } | |
1040 | ||
1041 | /// Check that a base is valid for a floating literal, emitting a nice | |
1042 | /// error if it isn't. | |
85aaf69f | 1043 | fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) { |
1a4d82fc | 1044 | match base { |
9cc50fc6 SL |
1045 | 16 => { |
1046 | self.err_span_(start_bpos, | |
1047 | last_bpos, | |
1048 | "hexadecimal float literal is not supported") | |
1049 | } | |
1050 | 8 => { | |
1051 | self.err_span_(start_bpos, | |
1052 | last_bpos, | |
1053 | "octal float literal is not supported") | |
1054 | } | |
1055 | 2 => { | |
1056 | self.err_span_(start_bpos, | |
1057 | last_bpos, | |
1058 | "binary float literal is not supported") | |
1059 | } | |
1060 | _ => (), | |
1a4d82fc JJ |
1061 | } |
1062 | } | |
1063 | ||
1064 | fn binop(&mut self, op: token::BinOpToken) -> token::Token { | |
1065 | self.bump(); | |
c30ab7b3 | 1066 | if self.ch_is('=') { |
1a4d82fc JJ |
1067 | self.bump(); |
1068 | return token::BinOpEq(op); | |
1069 | } else { | |
1070 | return token::BinOp(op); | |
1071 | } | |
1072 | } | |
1073 | ||
1074 | /// Return the next token from the string, advances the input past that | |
1075 | /// token, and updates the interner | |
a7813a04 | 1076 | fn next_token_inner(&mut self) -> Result<token::Token, ()> { |
c30ab7b3 | 1077 | let c = self.ch; |
9cc50fc6 SL |
1078 | if ident_start(c) && |
1079 | match (c.unwrap(), self.nextch(), self.nextnextch()) { | |
1a4d82fc JJ |
1080 | // Note: r as in r" or r#" is part of a raw string literal, |
1081 | // b as in b' is part of a byte literal. | |
1082 | // They are not identifiers, and are handled further down. | |
9cc50fc6 SL |
1083 | ('r', Some('"'), _) | |
1084 | ('r', Some('#'), _) | | |
1085 | ('b', Some('"'), _) | | |
1086 | ('b', Some('\''), _) | | |
1087 | ('b', Some('r'), Some('"')) | | |
1088 | ('b', Some('r'), Some('#')) => false, | |
1089 | _ => true, | |
1a4d82fc | 1090 | } { |
c30ab7b3 SL |
1091 | let start = self.pos; |
1092 | while ident_continue(self.ch) { | |
1a4d82fc JJ |
1093 | self.bump(); |
1094 | } | |
1095 | ||
a7813a04 | 1096 | return Ok(self.with_str_from(start, |string| { |
1a4d82fc JJ |
1097 | if string == "_" { |
1098 | token::Underscore | |
1099 | } else { | |
1100 | // FIXME: perform NFKC normalization here. (Issue #2253) | |
476ff2be | 1101 | token::Ident(Ident::from_str(string)) |
1a4d82fc | 1102 | } |
a7813a04 | 1103 | })); |
1a4d82fc JJ |
1104 | } |
1105 | ||
1106 | if is_dec_digit(c) { | |
1107 | let num = self.scan_number(c.unwrap()); | |
1108 | let suffix = self.scan_optional_raw_name(); | |
1109 | debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix); | |
a7813a04 | 1110 | return Ok(token::Literal(num, suffix)); |
1a4d82fc JJ |
1111 | } |
1112 | ||
1a4d82fc | 1113 | match c.expect("next_token_inner called at EOF") { |
9cc50fc6 SL |
1114 | // One-byte tokens. |
1115 | ';' => { | |
1a4d82fc | 1116 | self.bump(); |
a7813a04 | 1117 | return Ok(token::Semi); |
9cc50fc6 SL |
1118 | } |
1119 | ',' => { | |
1120 | self.bump(); | |
a7813a04 | 1121 | return Ok(token::Comma); |
9cc50fc6 SL |
1122 | } |
1123 | '.' => { | |
1124 | self.bump(); | |
c30ab7b3 | 1125 | return if self.ch_is('.') { |
9cc50fc6 | 1126 | self.bump(); |
c30ab7b3 | 1127 | if self.ch_is('.') { |
9cc50fc6 | 1128 | self.bump(); |
a7813a04 | 1129 | Ok(token::DotDotDot) |
9cc50fc6 | 1130 | } else { |
a7813a04 | 1131 | Ok(token::DotDot) |
9cc50fc6 SL |
1132 | } |
1133 | } else { | |
a7813a04 | 1134 | Ok(token::Dot) |
9cc50fc6 SL |
1135 | }; |
1136 | } | |
1137 | '(' => { | |
1138 | self.bump(); | |
a7813a04 | 1139 | return Ok(token::OpenDelim(token::Paren)); |
9cc50fc6 SL |
1140 | } |
1141 | ')' => { | |
1142 | self.bump(); | |
a7813a04 | 1143 | return Ok(token::CloseDelim(token::Paren)); |
9cc50fc6 SL |
1144 | } |
1145 | '{' => { | |
1146 | self.bump(); | |
a7813a04 | 1147 | return Ok(token::OpenDelim(token::Brace)); |
9cc50fc6 SL |
1148 | } |
1149 | '}' => { | |
1150 | self.bump(); | |
a7813a04 | 1151 | return Ok(token::CloseDelim(token::Brace)); |
9cc50fc6 SL |
1152 | } |
1153 | '[' => { | |
1154 | self.bump(); | |
a7813a04 | 1155 | return Ok(token::OpenDelim(token::Bracket)); |
9cc50fc6 SL |
1156 | } |
1157 | ']' => { | |
1158 | self.bump(); | |
a7813a04 | 1159 | return Ok(token::CloseDelim(token::Bracket)); |
9cc50fc6 SL |
1160 | } |
1161 | '@' => { | |
1162 | self.bump(); | |
a7813a04 | 1163 | return Ok(token::At); |
9cc50fc6 SL |
1164 | } |
1165 | '#' => { | |
1166 | self.bump(); | |
a7813a04 | 1167 | return Ok(token::Pound); |
9cc50fc6 SL |
1168 | } |
1169 | '~' => { | |
1170 | self.bump(); | |
a7813a04 | 1171 | return Ok(token::Tilde); |
9cc50fc6 SL |
1172 | } |
1173 | '?' => { | |
1174 | self.bump(); | |
a7813a04 | 1175 | return Ok(token::Question); |
9cc50fc6 SL |
1176 | } |
1177 | ':' => { | |
1178 | self.bump(); | |
c30ab7b3 | 1179 | if self.ch_is(':') { |
9cc50fc6 | 1180 | self.bump(); |
a7813a04 | 1181 | return Ok(token::ModSep); |
9cc50fc6 | 1182 | } else { |
a7813a04 | 1183 | return Ok(token::Colon); |
9cc50fc6 | 1184 | } |
1a4d82fc | 1185 | } |
1a4d82fc | 1186 | |
9cc50fc6 SL |
1187 | '$' => { |
1188 | self.bump(); | |
a7813a04 | 1189 | return Ok(token::Dollar); |
9cc50fc6 | 1190 | } |
1a4d82fc | 1191 | |
9cc50fc6 SL |
1192 | // Multi-byte tokens. |
1193 | '=' => { | |
1a4d82fc | 1194 | self.bump(); |
c30ab7b3 | 1195 | if self.ch_is('=') { |
9cc50fc6 | 1196 | self.bump(); |
a7813a04 | 1197 | return Ok(token::EqEq); |
c30ab7b3 | 1198 | } else if self.ch_is('>') { |
9cc50fc6 | 1199 | self.bump(); |
a7813a04 | 1200 | return Ok(token::FatArrow); |
9cc50fc6 | 1201 | } else { |
a7813a04 | 1202 | return Ok(token::Eq); |
9cc50fc6 SL |
1203 | } |
1204 | } | |
1205 | '!' => { | |
1a4d82fc | 1206 | self.bump(); |
c30ab7b3 | 1207 | if self.ch_is('=') { |
9cc50fc6 | 1208 | self.bump(); |
a7813a04 | 1209 | return Ok(token::Ne); |
9cc50fc6 | 1210 | } else { |
a7813a04 | 1211 | return Ok(token::Not); |
9cc50fc6 | 1212 | } |
1a4d82fc | 1213 | } |
9cc50fc6 | 1214 | '<' => { |
1a4d82fc | 1215 | self.bump(); |
c30ab7b3 | 1216 | match self.ch.unwrap_or('\x00') { |
9cc50fc6 SL |
1217 | '=' => { |
1218 | self.bump(); | |
a7813a04 | 1219 | return Ok(token::Le); |
9cc50fc6 SL |
1220 | } |
1221 | '<' => { | |
a7813a04 | 1222 | return Ok(self.binop(token::Shl)); |
9cc50fc6 SL |
1223 | } |
1224 | '-' => { | |
1225 | self.bump(); | |
c30ab7b3 | 1226 | match self.ch.unwrap_or('\x00') { |
9cc50fc6 | 1227 | _ => { |
a7813a04 | 1228 | return Ok(token::LArrow); |
9cc50fc6 SL |
1229 | } |
1230 | } | |
1231 | } | |
1232 | _ => { | |
a7813a04 | 1233 | return Ok(token::Lt); |
9cc50fc6 SL |
1234 | } |
1235 | } | |
1236 | } | |
1237 | '>' => { | |
1a4d82fc | 1238 | self.bump(); |
c30ab7b3 | 1239 | match self.ch.unwrap_or('\x00') { |
9cc50fc6 SL |
1240 | '=' => { |
1241 | self.bump(); | |
a7813a04 | 1242 | return Ok(token::Ge); |
9cc50fc6 SL |
1243 | } |
1244 | '>' => { | |
a7813a04 | 1245 | return Ok(self.binop(token::Shr)); |
9cc50fc6 SL |
1246 | } |
1247 | _ => { | |
a7813a04 | 1248 | return Ok(token::Gt); |
9cc50fc6 | 1249 | } |
1a4d82fc | 1250 | } |
1a4d82fc | 1251 | } |
9cc50fc6 SL |
1252 | '\'' => { |
1253 | // Either a character constant 'a' OR a lifetime name 'abc | |
c30ab7b3 | 1254 | let start_with_quote = self.pos; |
9cc50fc6 | 1255 | self.bump(); |
c30ab7b3 | 1256 | let start = self.pos; |
1a4d82fc | 1257 | |
9cc50fc6 | 1258 | // the eof will be picked up by the final `'` check below |
c30ab7b3 | 1259 | let c2 = self.ch.unwrap_or('\x00'); |
9cc50fc6 | 1260 | self.bump(); |
1a4d82fc | 1261 | |
9cc50fc6 SL |
1262 | // If the character is an ident start not followed by another single |
1263 | // quote, then this is a lifetime name: | |
c30ab7b3 SL |
1264 | if ident_start(Some(c2)) && !self.ch_is('\'') { |
1265 | while ident_continue(self.ch) { | |
9cc50fc6 SL |
1266 | self.bump(); |
1267 | } | |
1268 | // lifetimes shouldn't end with a single quote | |
1269 | // if we find one, then this is an invalid character literal | |
c30ab7b3 | 1270 | if self.ch_is('\'') { |
9cc50fc6 | 1271 | panic!(self.fatal_span_verbose( |
c30ab7b3 | 1272 | start_with_quote, self.next_pos, |
9cc50fc6 | 1273 | String::from("character literal may only contain one codepoint"))); |
1a4d82fc | 1274 | |
9cc50fc6 | 1275 | } |
1a4d82fc | 1276 | |
9cc50fc6 SL |
1277 | // Include the leading `'` in the real identifier, for macro |
1278 | // expansion purposes. See #12512 for the gory details of why | |
1279 | // this is necessary. | |
1280 | let ident = self.with_str_from(start, |lifetime_name| { | |
476ff2be | 1281 | Ident::from_str(&format!("'{}", lifetime_name)) |
9cc50fc6 SL |
1282 | }); |
1283 | ||
1284 | // Conjure up a "keyword checking ident" to make sure that | |
1285 | // the lifetime name is not a keyword. | |
1286 | let keyword_checking_ident = self.with_str_from(start, |lifetime_name| { | |
476ff2be | 1287 | Ident::from_str(lifetime_name) |
1a4d82fc | 1288 | }); |
a7813a04 | 1289 | let keyword_checking_token = &token::Ident(keyword_checking_ident); |
c30ab7b3 | 1290 | let last_bpos = self.pos; |
a7813a04 XL |
1291 | if keyword_checking_token.is_any_keyword() && |
1292 | !keyword_checking_token.is_keyword(keywords::Static) { | |
1293 | self.err_span_(start, last_bpos, "lifetimes cannot use keyword names"); | |
9cc50fc6 SL |
1294 | } |
1295 | ||
a7813a04 | 1296 | return Ok(token::Lifetime(ident)); |
1a4d82fc | 1297 | } |
1a4d82fc | 1298 | |
9cc50fc6 SL |
1299 | let valid = self.scan_char_or_byte(start, |
1300 | c2, | |
1301 | // ascii_only = | |
1302 | false, | |
1303 | '\''); | |
92a42be0 | 1304 | |
c30ab7b3 | 1305 | if !self.ch_is('\'') { |
9cc50fc6 | 1306 | panic!(self.fatal_span_verbose( |
c30ab7b3 | 1307 | start_with_quote, self.pos, |
9cc50fc6 | 1308 | String::from("character literal may only contain one codepoint"))); |
1a4d82fc JJ |
1309 | } |
1310 | ||
9cc50fc6 SL |
1311 | let id = if valid { |
1312 | self.name_from(start) | |
1313 | } else { | |
476ff2be | 1314 | Symbol::intern("0") |
9cc50fc6 | 1315 | }; |
c30ab7b3 | 1316 | self.bump(); // advance ch past token |
9cc50fc6 | 1317 | let suffix = self.scan_optional_raw_name(); |
a7813a04 | 1318 | return Ok(token::Literal(token::Char(id), suffix)); |
1a4d82fc | 1319 | } |
9cc50fc6 | 1320 | 'b' => { |
1a4d82fc | 1321 | self.bump(); |
c30ab7b3 | 1322 | let lit = match self.ch { |
9cc50fc6 SL |
1323 | Some('\'') => self.scan_byte(), |
1324 | Some('"') => self.scan_byte_string(), | |
1325 | Some('r') => self.scan_raw_byte_string(), | |
1326 | _ => unreachable!(), // Should have been a token::Ident above. | |
1327 | }; | |
1328 | let suffix = self.scan_optional_raw_name(); | |
a7813a04 | 1329 | return Ok(token::Literal(lit, suffix)); |
1a4d82fc | 1330 | } |
9cc50fc6 | 1331 | '"' => { |
c30ab7b3 | 1332 | let start_bpos = self.pos; |
9cc50fc6 SL |
1333 | let mut valid = true; |
1334 | self.bump(); | |
c30ab7b3 | 1335 | while !self.ch_is('"') { |
9cc50fc6 | 1336 | if self.is_eof() { |
c30ab7b3 | 1337 | let last_bpos = self.pos; |
9cc50fc6 SL |
1338 | panic!(self.fatal_span_(start_bpos, |
1339 | last_bpos, | |
1340 | "unterminated double quote string")); | |
1341 | } | |
1a4d82fc | 1342 | |
c30ab7b3 SL |
1343 | let ch_start = self.pos; |
1344 | let ch = self.ch.unwrap(); | |
9cc50fc6 SL |
1345 | self.bump(); |
1346 | valid &= self.scan_char_or_byte(ch_start, | |
1347 | ch, | |
1348 | // ascii_only = | |
1349 | false, | |
1350 | '"'); | |
1351 | } | |
1352 | // adjust for the ASCII " at the start of the literal | |
1353 | let id = if valid { | |
1354 | self.name_from(start_bpos + BytePos(1)) | |
1355 | } else { | |
476ff2be | 1356 | Symbol::intern("??") |
9cc50fc6 SL |
1357 | }; |
1358 | self.bump(); | |
1359 | let suffix = self.scan_optional_raw_name(); | |
a7813a04 | 1360 | return Ok(token::Literal(token::Str_(id), suffix)); |
1a4d82fc | 1361 | } |
9cc50fc6 | 1362 | 'r' => { |
c30ab7b3 | 1363 | let start_bpos = self.pos; |
9cc50fc6 SL |
1364 | self.bump(); |
1365 | let mut hash_count = 0; | |
c30ab7b3 | 1366 | while self.ch_is('#') { |
9cc50fc6 SL |
1367 | self.bump(); |
1368 | hash_count += 1; | |
1369 | } | |
1370 | ||
1a4d82fc | 1371 | if self.is_eof() { |
c30ab7b3 | 1372 | let last_bpos = self.pos; |
92a42be0 | 1373 | panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string")); |
c30ab7b3 SL |
1374 | } else if !self.ch_is('"') { |
1375 | let last_bpos = self.pos; | |
1376 | let curr_char = self.ch.unwrap(); | |
9cc50fc6 SL |
1377 | panic!(self.fatal_span_char(start_bpos, |
1378 | last_bpos, | |
1379 | "found invalid character; only `#` is allowed \ | |
1380 | in raw string delimitation", | |
1381 | curr_char)); | |
1a4d82fc | 1382 | } |
9cc50fc6 | 1383 | self.bump(); |
c30ab7b3 | 1384 | let content_start_bpos = self.pos; |
9cc50fc6 SL |
1385 | let mut content_end_bpos; |
1386 | let mut valid = true; | |
1387 | 'outer: loop { | |
1388 | if self.is_eof() { | |
c30ab7b3 | 1389 | let last_bpos = self.pos; |
9cc50fc6 SL |
1390 | panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string")); |
1391 | } | |
c30ab7b3 SL |
1392 | // if self.ch_is('"') { |
1393 | // content_end_bpos = self.pos; | |
9cc50fc6 SL |
1394 | // for _ in 0..hash_count { |
1395 | // self.bump(); | |
c30ab7b3 | 1396 | // if !self.ch_is('#') { |
9cc50fc6 | 1397 | // continue 'outer; |
c30ab7b3 | 1398 | let c = self.ch.unwrap(); |
9cc50fc6 SL |
1399 | match c { |
1400 | '"' => { | |
c30ab7b3 | 1401 | content_end_bpos = self.pos; |
9cc50fc6 SL |
1402 | for _ in 0..hash_count { |
1403 | self.bump(); | |
c30ab7b3 | 1404 | if !self.ch_is('#') { |
9cc50fc6 SL |
1405 | continue 'outer; |
1406 | } | |
1a4d82fc | 1407 | } |
9cc50fc6 | 1408 | break; |
1a4d82fc | 1409 | } |
9cc50fc6 SL |
1410 | '\r' => { |
1411 | if !self.nextch_is('\n') { | |
c30ab7b3 | 1412 | let last_bpos = self.pos; |
9cc50fc6 SL |
1413 | self.err_span_(start_bpos, |
1414 | last_bpos, | |
1415 | "bare CR not allowed in raw string, use \\r \ | |
1416 | instead"); | |
1417 | valid = false; | |
1418 | } | |
1a4d82fc | 1419 | } |
9cc50fc6 | 1420 | _ => (), |
1a4d82fc | 1421 | } |
9cc50fc6 | 1422 | self.bump(); |
1a4d82fc JJ |
1423 | } |
1424 | self.bump(); | |
9cc50fc6 SL |
1425 | let id = if valid { |
1426 | self.name_from_to(content_start_bpos, content_end_bpos) | |
1427 | } else { | |
476ff2be | 1428 | Symbol::intern("??") |
9cc50fc6 SL |
1429 | }; |
1430 | let suffix = self.scan_optional_raw_name(); | |
a7813a04 | 1431 | return Ok(token::Literal(token::StrRaw(id, hash_count), suffix)); |
9cc50fc6 SL |
1432 | } |
1433 | '-' => { | |
1434 | if self.nextch_is('>') { | |
1435 | self.bump(); | |
1436 | self.bump(); | |
a7813a04 | 1437 | return Ok(token::RArrow); |
9cc50fc6 | 1438 | } else { |
a7813a04 | 1439 | return Ok(self.binop(token::Minus)); |
9cc50fc6 SL |
1440 | } |
1441 | } | |
1442 | '&' => { | |
1443 | if self.nextch_is('&') { | |
1444 | self.bump(); | |
1445 | self.bump(); | |
a7813a04 | 1446 | return Ok(token::AndAnd); |
9cc50fc6 | 1447 | } else { |
a7813a04 | 1448 | return Ok(self.binop(token::And)); |
9cc50fc6 SL |
1449 | } |
1450 | } | |
1451 | '|' => { | |
1452 | match self.nextch() { | |
1453 | Some('|') => { | |
1454 | self.bump(); | |
1455 | self.bump(); | |
a7813a04 | 1456 | return Ok(token::OrOr); |
9cc50fc6 SL |
1457 | } |
1458 | _ => { | |
a7813a04 | 1459 | return Ok(self.binop(token::Or)); |
9cc50fc6 SL |
1460 | } |
1461 | } | |
1462 | } | |
1463 | '+' => { | |
a7813a04 | 1464 | return Ok(self.binop(token::Plus)); |
9cc50fc6 SL |
1465 | } |
1466 | '*' => { | |
a7813a04 | 1467 | return Ok(self.binop(token::Star)); |
9cc50fc6 SL |
1468 | } |
1469 | '/' => { | |
a7813a04 | 1470 | return Ok(self.binop(token::Slash)); |
9cc50fc6 SL |
1471 | } |
1472 | '^' => { | |
a7813a04 | 1473 | return Ok(self.binop(token::Caret)); |
9cc50fc6 SL |
1474 | } |
1475 | '%' => { | |
a7813a04 | 1476 | return Ok(self.binop(token::Percent)); |
9cc50fc6 SL |
1477 | } |
1478 | c => { | |
c30ab7b3 SL |
1479 | let last_bpos = self.pos; |
1480 | let bpos = self.next_pos; | |
9cc50fc6 SL |
1481 | let mut err = self.struct_fatal_span_char(last_bpos, |
1482 | bpos, | |
1483 | "unknown start of token", | |
1484 | c); | |
1485 | unicode_chars::check_for_substitution(&self, c, &mut err); | |
a7813a04 XL |
1486 | self.fatal_errs.push(err); |
1487 | Err(()) | |
1a4d82fc | 1488 | } |
1a4d82fc JJ |
1489 | } |
1490 | } | |
1491 | ||
1492 | fn consume_whitespace(&mut self) { | |
c30ab7b3 | 1493 | while is_pattern_whitespace(self.ch) && !self.is_eof() { |
9cc50fc6 SL |
1494 | self.bump(); |
1495 | } | |
1a4d82fc JJ |
1496 | } |
1497 | ||
1498 | fn read_to_eol(&mut self) -> String { | |
1499 | let mut val = String::new(); | |
c30ab7b3 SL |
1500 | while !self.ch_is('\n') && !self.is_eof() { |
1501 | val.push(self.ch.unwrap()); | |
1a4d82fc JJ |
1502 | self.bump(); |
1503 | } | |
c30ab7b3 | 1504 | if self.ch_is('\n') { |
9cc50fc6 SL |
1505 | self.bump(); |
1506 | } | |
1507 | return val; | |
1a4d82fc JJ |
1508 | } |
1509 | ||
1510 | fn read_one_line_comment(&mut self) -> String { | |
1511 | let val = self.read_to_eol(); | |
9cc50fc6 SL |
1512 | assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') || |
1513 | (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!')); | |
1a4d82fc JJ |
1514 | return val; |
1515 | } | |
1516 | ||
1517 | fn consume_non_eol_whitespace(&mut self) { | |
c30ab7b3 | 1518 | while is_pattern_whitespace(self.ch) && !self.ch_is('\n') && !self.is_eof() { |
1a4d82fc JJ |
1519 | self.bump(); |
1520 | } | |
1521 | } | |
1522 | ||
1523 | fn peeking_at_comment(&self) -> bool { | |
c30ab7b3 | 1524 | (self.ch_is('/') && self.nextch_is('/')) || (self.ch_is('/') && self.nextch_is('*')) || |
9cc50fc6 | 1525 | // consider shebangs comments, but not inner attributes |
c30ab7b3 | 1526 | (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('[')) |
1a4d82fc JJ |
1527 | } |
1528 | ||
1529 | fn scan_byte(&mut self) -> token::Lit { | |
1530 | self.bump(); | |
c30ab7b3 | 1531 | let start = self.pos; |
1a4d82fc JJ |
1532 | |
1533 | // the eof will be picked up by the final `'` check below | |
c30ab7b3 | 1534 | let c2 = self.ch.unwrap_or('\x00'); |
1a4d82fc JJ |
1535 | self.bump(); |
1536 | ||
9cc50fc6 SL |
1537 | let valid = self.scan_char_or_byte(start, |
1538 | c2, | |
1539 | // ascii_only = | |
1540 | true, | |
1541 | '\''); | |
c30ab7b3 | 1542 | if !self.ch_is('\'') { |
1a4d82fc JJ |
1543 | // Byte offsetting here is okay because the |
1544 | // character before position `start` are an | |
1545 | // ascii single quote and ascii 'b'. | |
c30ab7b3 | 1546 | let pos = self.pos; |
9cc50fc6 | 1547 | panic!(self.fatal_span_verbose(start - BytePos(2), |
c30ab7b3 | 1548 | pos, |
9cc50fc6 | 1549 | "unterminated byte constant".to_string())); |
1a4d82fc JJ |
1550 | } |
1551 | ||
9cc50fc6 SL |
1552 | let id = if valid { |
1553 | self.name_from(start) | |
1554 | } else { | |
476ff2be | 1555 | Symbol::intern("?") |
9cc50fc6 | 1556 | }; |
c30ab7b3 | 1557 | self.bump(); // advance ch past token |
1a4d82fc JJ |
1558 | return token::Byte(id); |
1559 | } | |
1560 | ||
1561 | fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool { | |
1562 | self.scan_hex_digits(2, delim, below_0x7f_only) | |
1563 | } | |
1564 | ||
1565 | fn scan_byte_string(&mut self) -> token::Lit { | |
1566 | self.bump(); | |
c30ab7b3 | 1567 | let start = self.pos; |
1a4d82fc JJ |
1568 | let mut valid = true; |
1569 | ||
c30ab7b3 | 1570 | while !self.ch_is('"') { |
1a4d82fc | 1571 | if self.is_eof() { |
c30ab7b3 SL |
1572 | let pos = self.pos; |
1573 | panic!(self.fatal_span_(start, pos, "unterminated double quote byte string")); | |
1a4d82fc JJ |
1574 | } |
1575 | ||
c30ab7b3 SL |
1576 | let ch_start = self.pos; |
1577 | let ch = self.ch.unwrap(); | |
1a4d82fc | 1578 | self.bump(); |
9cc50fc6 SL |
1579 | valid &= self.scan_char_or_byte(ch_start, |
1580 | ch, | |
1581 | // ascii_only = | |
1582 | true, | |
1583 | '"'); | |
1a4d82fc | 1584 | } |
9cc50fc6 SL |
1585 | let id = if valid { |
1586 | self.name_from(start) | |
1587 | } else { | |
476ff2be | 1588 | Symbol::intern("??") |
9cc50fc6 | 1589 | }; |
1a4d82fc | 1590 | self.bump(); |
e9174d1e | 1591 | return token::ByteStr(id); |
1a4d82fc JJ |
1592 | } |
1593 | ||
1594 | fn scan_raw_byte_string(&mut self) -> token::Lit { | |
c30ab7b3 | 1595 | let start_bpos = self.pos; |
1a4d82fc | 1596 | self.bump(); |
85aaf69f | 1597 | let mut hash_count = 0; |
c30ab7b3 | 1598 | while self.ch_is('#') { |
1a4d82fc JJ |
1599 | self.bump(); |
1600 | hash_count += 1; | |
1601 | } | |
1602 | ||
1603 | if self.is_eof() { | |
c30ab7b3 SL |
1604 | let pos = self.pos; |
1605 | panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string")); | |
1606 | } else if !self.ch_is('"') { | |
1607 | let pos = self.pos; | |
1608 | let ch = self.ch.unwrap(); | |
9cc50fc6 | 1609 | panic!(self.fatal_span_char(start_bpos, |
c30ab7b3 | 1610 | pos, |
9cc50fc6 SL |
1611 | "found invalid character; only `#` is allowed in raw \ |
1612 | string delimitation", | |
1613 | ch)); | |
1a4d82fc JJ |
1614 | } |
1615 | self.bump(); | |
c30ab7b3 | 1616 | let content_start_bpos = self.pos; |
1a4d82fc JJ |
1617 | let mut content_end_bpos; |
1618 | 'outer: loop { | |
c30ab7b3 | 1619 | match self.ch { |
1a4d82fc | 1620 | None => { |
c30ab7b3 SL |
1621 | let pos = self.pos; |
1622 | panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string")) | |
9cc50fc6 | 1623 | } |
1a4d82fc | 1624 | Some('"') => { |
c30ab7b3 | 1625 | content_end_bpos = self.pos; |
85aaf69f | 1626 | for _ in 0..hash_count { |
1a4d82fc | 1627 | self.bump(); |
c30ab7b3 | 1628 | if !self.ch_is('#') { |
1a4d82fc JJ |
1629 | continue 'outer; |
1630 | } | |
1631 | } | |
1632 | break; | |
9cc50fc6 SL |
1633 | } |
1634 | Some(c) => { | |
1635 | if c > '\x7F' { | |
c30ab7b3 SL |
1636 | let pos = self.pos; |
1637 | self.err_span_char(pos, pos, "raw byte string must be ASCII", c); | |
9cc50fc6 | 1638 | } |
1a4d82fc JJ |
1639 | } |
1640 | } | |
1641 | self.bump(); | |
1642 | } | |
1643 | self.bump(); | |
9cc50fc6 SL |
1644 | return token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos), |
1645 | hash_count); | |
1a4d82fc JJ |
1646 | } |
1647 | } | |
1648 | ||
54a0048b SL |
1649 | // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which |
1650 | // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3 | |
1651 | pub fn is_pattern_whitespace(c: Option<char>) -> bool { | |
1652 | c.map_or(false, Pattern_White_Space) | |
1a4d82fc JJ |
1653 | } |
1654 | ||
/// Whether `c` is present and lies in the inclusive range `lo..hi`.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}
1661 | ||
9cc50fc6 SL |
1662 | fn is_dec_digit(c: Option<char>) -> bool { |
1663 | return in_range(c, '0', '9'); | |
1664 | } | |
1a4d82fc JJ |
1665 | |
1666 | pub fn is_doc_comment(s: &str) -> bool { | |
9cc50fc6 SL |
1667 | let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') || |
1668 | s.starts_with("//!"); | |
1a4d82fc JJ |
1669 | debug!("is {:?} a doc comment? {}", s, res); |
1670 | res | |
1671 | } | |
1672 | ||
1673 | pub fn is_block_doc_comment(s: &str) -> bool { | |
9cc50fc6 SL |
1674 | // Prevent `/**/` from being parsed as a doc comment |
1675 | let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') || | |
1676 | s.starts_with("/*!")) && s.len() >= 5; | |
1a4d82fc JJ |
1677 | debug!("is {:?} a doc comment? {}", s, res); |
1678 | res | |
1679 | } | |
1680 | ||
1681 | fn ident_start(c: Option<char>) -> bool { | |
9cc50fc6 SL |
1682 | let c = match c { |
1683 | Some(c) => c, | |
1684 | None => return false, | |
1685 | }; | |
1a4d82fc | 1686 | |
9cc50fc6 | 1687 | (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start()) |
1a4d82fc JJ |
1688 | } |
1689 | ||
1690 | fn ident_continue(c: Option<char>) -> bool { | |
9cc50fc6 SL |
1691 | let c = match c { |
1692 | Some(c) => c, | |
1693 | None => return false, | |
1694 | }; | |
1a4d82fc | 1695 | |
9cc50fc6 SL |
1696 | (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || |
1697 | (c > '\x7f' && c.is_xid_continue()) | |
1a4d82fc JJ |
1698 | } |
1699 | ||
#[cfg(test)]
mod tests {
    use super::*;

    use ast::Ident;
    use symbol::Symbol;
    use syntax_pos::{BytePos, Span, NO_EXPANSION};
    use codemap::CodeMap;
    use errors;
    use parse::token;
    use std::io;
    use std::rc::Rc;

    /// Builds an error handler for `cm` whose emitter writes into `io::sink()`.
    fn mk_sh(cm: Rc<CodeMap>) -> errors::Handler {
        // FIXME (#22405): Replace `Box::new` with `box` here when/if possible.
        let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), Some(cm));
        errors::Handler::with_emitter(true, false, Box::new(emitter))
    }

    /// Opens a string reader over `teststr`, registered in `cm` as `zebra.rs`.
    fn setup<'a>(cm: &CodeMap,
                 span_handler: &'a errors::Handler,
                 teststr: String)
                 -> StringReader<'a> {
        let filemap = cm.new_filemap("zebra.rs".to_string(), None, teststr);
        StringReader::new(span_handler, filemap)
    }

    /// Builds an unexpanded span covering `lo..hi`.
    fn span_between(lo: BytePos, hi: BytePos) -> Span {
        Span {
            lo: lo,
            hi: hi,
            expn_id: NO_EXPANSION,
        }
    }

    #[test]
    fn t1() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let source = "/* my source file */ fn main() { println!(\"zebra\"); }\n".to_string();
        let mut reader = setup(&cm, &sh, source);
        let id = Ident::from_str("fn");

        // leading block comment and the blank after it
        assert_eq!(reader.next_token().tok, token::Comment);
        assert_eq!(reader.next_token().tok, token::Whitespace);

        // `fn` occupies bytes 21..23
        let actual_fn = reader.next_token();
        let expected_fn = TokenAndSpan {
            tok: token::Ident(id),
            sp: span_between(BytePos(21), BytePos(23)),
        };
        assert_eq!(actual_fn, expected_fn);
        assert_eq!(reader.next_token().tok, token::Whitespace);

        // the `main` identifier has already been read at this point:
        assert_eq!(reader.pos, BytePos(28));

        // pull one more token: `main` occupies bytes 24..28
        let actual_main = reader.next_token();
        let expected_main = TokenAndSpan {
            tok: token::Ident(Ident::from_str("main")),
            sp: span_between(BytePos(24), BytePos(28)),
        };
        assert_eq!(actual_main, expected_main);

        // the opening parenthesis has already been read:
        assert_eq!(reader.pos, BytePos(29))
    }

    // Check that the given reader yields the expected tokens, in order.
    // Checking stops once the expected vec is exhausted.
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for wanted in expected.iter() {
            let got = string_reader.next_token().tok;
            assert_eq!(&got, wanted);
        }
    }

    // Make an identifier token by looking the string up in the interner.
    fn mk_ident(id: &str) -> token::Token {
        let ident = Ident::from_str(id);
        token::Ident(ident)
    }

    #[test]
    fn doublecolonparsing() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let reader = setup(&cm, &sh, "a b".to_string());
        let expected = vec![mk_ident("a"), token::Whitespace, mk_ident("b")];
        check_tokenization(reader, expected);
    }

    #[test]
    fn dcparsing_2() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let reader = setup(&cm, &sh, "a::b".to_string());
        let expected = vec![mk_ident("a"), token::ModSep, mk_ident("b")];
        check_tokenization(reader, expected);
    }

    #[test]
    fn dcparsing_3() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let reader = setup(&cm, &sh, "a ::b".to_string());
        let expected = vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")];
        check_tokenization(reader, expected);
    }

    #[test]
    fn dcparsing_4() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let reader = setup(&cm, &sh, "a:: b".to_string());
        let expected = vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")];
        check_tokenization(reader, expected);
    }

    #[test]
    fn character_a() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "'a'".to_string());
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(Symbol::intern("a")), None));
    }

    #[test]
    fn character_space() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "' '".to_string());
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(Symbol::intern(" ")), None));
    }

    #[test]
    fn character_escaped() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "'\\n'".to_string());
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(Symbol::intern("\\n")), None));
    }

    #[test]
    fn lifetime_name() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "'abc".to_string());
        assert_eq!(lexer.next_token().tok,
                   token::Lifetime(Ident::from_str("'abc")));
    }

    #[test]
    fn raw_string() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "r###\"\"#a\\b\x00c\"\"###".to_string());
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::StrRaw(Symbol::intern("\"#a\\b\x00c\""), 3), None));
    }

    #[test]
    fn literal_suffixes() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        // Lexes `$input` twice: once with `suffix` glued on, which must show
        // up as the literal's suffix, and once with a blank in between, which
        // must leave the literal without a suffix.
        macro_rules! check_suffix {
            ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
                let glued = format!("{}suffix", $input);
                assert_eq!(setup(&cm, &sh, glued).next_token().tok,
                           token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
                                          Some(Symbol::intern("suffix"))));
                // with a whitespace separator:
                let spaced = format!("{} suffix", $input);
                assert_eq!(setup(&cm, &sh, spaced).next_token().tok,
                           token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
                                          None));
            }}
        }

        check_suffix!("'a'", Char, "a");
        check_suffix!("b'a'", Byte, "a");
        check_suffix!("\"a\"", Str_, "a");
        check_suffix!("b\"a\"", ByteStr, "a");
        check_suffix!("1234", Integer, "1234");
        check_suffix!("0b101", Integer, "0b101");
        check_suffix!("0xABC", Integer, "0xABC");
        check_suffix!("1.0", Float, "1.0");
        check_suffix!("1.0e10", Float, "1.0e10");

        let mut int_lexer = setup(&cm, &sh, "2us".to_string());
        assert_eq!(int_lexer.next_token().tok,
                   token::Literal(token::Integer(Symbol::intern("2")),
                                  Some(Symbol::intern("us"))));

        let mut raw_lexer = setup(&cm, &sh, "r###\"raw\"###suffix".to_string());
        assert_eq!(raw_lexer.next_token().tok,
                   token::Literal(token::StrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));

        let mut raw_byte_lexer = setup(&cm, &sh, "br###\"raw\"###suffix".to_string());
        assert_eq!(raw_byte_lexer.next_token().tok,
                   token::Literal(token::ByteStrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));
    }

    #[test]
    fn line_doc_comments() {
        // three slashes make a doc comment, four do not
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        assert!(!is_doc_comment("////"));
    }

    #[test]
    fn nested_block_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
        // the whole nested comment must come out as one comment token
        if lexer.next_token().tok != token::Comment {
            panic!("expected a comment!");
        }
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(Symbol::intern("a")), None));
    }

    #[test]
    fn crlf_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sh(cm.clone());
        let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());

        // the plain comment covers `// test` only (bytes 0..7), not the `\r\n`
        let comment = lexer.next_token();
        assert_eq!(comment.tok, token::Comment);
        assert_eq!(comment.sp, ::syntax_pos::mk_sp(BytePos(0), BytePos(7)));

        assert_eq!(lexer.next_token().tok, token::Whitespace);

        // the doc comment's text likewise excludes the line ending
        assert_eq!(lexer.next_token().tok,
                   token::DocComment(Symbol::intern("/// test")));
    }
}