]>
Commit | Line | Data |
---|---|---|
1a4d82fc JJ |
1 | // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | use ast; | |
3157f602 XL |
12 | use syntax_pos::{self, BytePos, CharPos, Pos, Span}; |
13 | use codemap::CodeMap; | |
9cc50fc6 | 14 | use errors::{FatalError, Handler, DiagnosticBuilder}; |
1a4d82fc | 15 | use ext::tt::transcribe::tt_next_token; |
a7813a04 | 16 | use parse::token::{self, keywords, str_to_ident}; |
d9579d0f | 17 | use str::char_at; |
54a0048b | 18 | use rustc_unicode::property::Pattern_White_Space; |
1a4d82fc | 19 | |
d9579d0f | 20 | use std::borrow::Cow; |
1a4d82fc | 21 | use std::char; |
1a4d82fc | 22 | use std::mem::replace; |
1a4d82fc | 23 | use std::rc::Rc; |
1a4d82fc JJ |
24 | |
25 | pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag}; | |
26 | ||
27 | pub mod comments; | |
92a42be0 | 28 | mod unicode_chars; |
1a4d82fc JJ |
29 | |
30 | pub trait Reader { | |
31 | fn is_eof(&self) -> bool; | |
a7813a04 XL |
32 | fn try_next_token(&mut self) -> Result<TokenAndSpan, ()>; |
33 | fn next_token(&mut self) -> TokenAndSpan where Self: Sized { | |
34 | let res = self.try_next_token(); | |
35 | self.unwrap_or_abort(res) | |
36 | } | |
1a4d82fc | 37 | /// Report a fatal error with the current span. |
92a42be0 | 38 | fn fatal(&self, &str) -> FatalError; |
1a4d82fc JJ |
39 | /// Report a non-fatal error with the current span. |
40 | fn err(&self, &str); | |
a7813a04 XL |
41 | fn emit_fatal_errors(&mut self); |
42 | fn unwrap_or_abort(&mut self, res: Result<TokenAndSpan, ()>) -> TokenAndSpan { | |
43 | match res { | |
44 | Ok(tok) => tok, | |
45 | Err(_) => { | |
46 | self.emit_fatal_errors(); | |
47 | panic!(FatalError); | |
48 | } | |
49 | } | |
50 | } | |
1a4d82fc JJ |
51 | fn peek(&self) -> TokenAndSpan; |
52 | /// Get a token the parser cares about. | |
a7813a04 XL |
53 | fn try_real_token(&mut self) -> Result<TokenAndSpan, ()> { |
54 | let mut t = self.try_next_token()?; | |
1a4d82fc JJ |
55 | loop { |
56 | match t.tok { | |
57 | token::Whitespace | token::Comment | token::Shebang(_) => { | |
a7813a04 | 58 | t = self.try_next_token()?; |
9cc50fc6 SL |
59 | } |
60 | _ => break, | |
1a4d82fc JJ |
61 | } |
62 | } | |
a7813a04 XL |
63 | Ok(t) |
64 | } | |
65 | fn real_token(&mut self) -> TokenAndSpan { | |
66 | let res = self.try_real_token(); | |
67 | self.unwrap_or_abort(res) | |
1a4d82fc JJ |
68 | } |
69 | } | |
70 | ||
85aaf69f | 71 | #[derive(Clone, PartialEq, Eq, Debug)] |
1a4d82fc JJ |
72 | pub struct TokenAndSpan { |
73 | pub tok: token::Token, | |
74 | pub sp: Span, | |
75 | } | |
76 | ||
77 | pub struct StringReader<'a> { | |
9cc50fc6 | 78 | pub span_diagnostic: &'a Handler, |
1a4d82fc JJ |
79 | /// The absolute offset within the codemap of the next character to read |
80 | pub pos: BytePos, | |
81 | /// The absolute offset within the codemap of the last character read(curr) | |
82 | pub last_pos: BytePos, | |
83 | /// The column of the next character to read | |
84 | pub col: CharPos, | |
85 | /// The last character to be read | |
86 | pub curr: Option<char>, | |
3157f602 | 87 | pub filemap: Rc<syntax_pos::FileMap>, |
9e0c209e SL |
88 | /// If Some, stop reading the source at this position (inclusive). |
89 | pub terminator: Option<BytePos>, | |
90 | /// Whether to record new-lines in filemap. This is only necessary the first | |
91 | /// time a filemap is lexed. If part of a filemap is being re-lexed, this | |
92 | /// should be set to false. | |
93 | pub save_new_lines: bool, | |
9cc50fc6 | 94 | // cached: |
1a4d82fc JJ |
95 | pub peek_tok: token::Token, |
96 | pub peek_span: Span, | |
a7813a04 | 97 | pub fatal_errs: Vec<DiagnosticBuilder<'a>>, |
c34b1796 AL |
98 | // cache a direct reference to the source text, so that we don't have to |
99 | // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time. | |
9cc50fc6 | 100 | source_text: Rc<String>, |
1a4d82fc JJ |
101 | } |
102 | ||
103 | impl<'a> Reader for StringReader<'a> { | |
9cc50fc6 | 104 | fn is_eof(&self) -> bool { |
9e0c209e SL |
105 | if self.curr.is_none() { |
106 | return true; | |
107 | } | |
108 | ||
109 | match self.terminator { | |
110 | Some(t) => self.pos > t, | |
111 | None => false, | |
112 | } | |
9cc50fc6 | 113 | } |
1a4d82fc | 114 | /// Return the next token. EFFECT: advances the string_reader. |
a7813a04 XL |
115 | fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> { |
116 | assert!(self.fatal_errs.is_empty()); | |
1a4d82fc JJ |
117 | let ret_val = TokenAndSpan { |
118 | tok: replace(&mut self.peek_tok, token::Underscore), | |
119 | sp: self.peek_span, | |
120 | }; | |
a7813a04 XL |
121 | self.advance_token()?; |
122 | Ok(ret_val) | |
1a4d82fc | 123 | } |
92a42be0 | 124 | fn fatal(&self, m: &str) -> FatalError { |
1a4d82fc JJ |
125 | self.fatal_span(self.peek_span, m) |
126 | } | |
127 | fn err(&self, m: &str) { | |
128 | self.err_span(self.peek_span, m) | |
129 | } | |
a7813a04 XL |
130 | fn emit_fatal_errors(&mut self) { |
131 | for err in &mut self.fatal_errs { | |
132 | err.emit(); | |
133 | } | |
134 | self.fatal_errs.clear(); | |
135 | } | |
1a4d82fc JJ |
136 | fn peek(&self) -> TokenAndSpan { |
137 | // FIXME(pcwalton): Bad copy! | |
138 | TokenAndSpan { | |
139 | tok: self.peek_tok.clone(), | |
140 | sp: self.peek_span, | |
141 | } | |
142 | } | |
143 | } | |
144 | ||
145 | impl<'a> Reader for TtReader<'a> { | |
146 | fn is_eof(&self) -> bool { | |
147 | self.cur_tok == token::Eof | |
148 | } | |
a7813a04 XL |
149 | fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> { |
150 | assert!(self.fatal_errs.is_empty()); | |
1a4d82fc JJ |
151 | let r = tt_next_token(self); |
152 | debug!("TtReader: r={:?}", r); | |
a7813a04 | 153 | Ok(r) |
1a4d82fc | 154 | } |
92a42be0 SL |
155 | fn fatal(&self, m: &str) -> FatalError { |
156 | self.sp_diag.span_fatal(self.cur_span, m) | |
1a4d82fc JJ |
157 | } |
158 | fn err(&self, m: &str) { | |
159 | self.sp_diag.span_err(self.cur_span, m); | |
160 | } | |
a7813a04 XL |
161 | fn emit_fatal_errors(&mut self) { |
162 | for err in &mut self.fatal_errs { | |
163 | err.emit(); | |
164 | } | |
165 | self.fatal_errs.clear(); | |
166 | } | |
1a4d82fc JJ |
167 | fn peek(&self) -> TokenAndSpan { |
168 | TokenAndSpan { | |
169 | tok: self.cur_tok.clone(), | |
170 | sp: self.cur_span, | |
171 | } | |
172 | } | |
173 | } | |
174 | ||
1a4d82fc JJ |
175 | impl<'a> StringReader<'a> { |
176 | /// For comments.rs, which hackily pokes into pos and curr | |
9cc50fc6 | 177 | pub fn new_raw<'b>(span_diagnostic: &'b Handler, |
3157f602 | 178 | filemap: Rc<syntax_pos::FileMap>) |
9cc50fc6 | 179 | -> StringReader<'b> { |
9e0c209e SL |
180 | let mut sr = StringReader::new_raw_internal(span_diagnostic, filemap); |
181 | sr.bump(); | |
182 | sr | |
183 | } | |
184 | ||
185 | fn new_raw_internal<'b>(span_diagnostic: &'b Handler, | |
186 | filemap: Rc<syntax_pos::FileMap>) | |
187 | -> StringReader<'b> { | |
c34b1796 | 188 | if filemap.src.is_none() { |
9cc50fc6 SL |
189 | span_diagnostic.bug(&format!("Cannot lex filemap \ |
190 | without source: {}", | |
191 | filemap.name)[..]); | |
c34b1796 AL |
192 | } |
193 | ||
194 | let source_text = (*filemap.src.as_ref().unwrap()).clone(); | |
195 | ||
9e0c209e | 196 | StringReader { |
1a4d82fc JJ |
197 | span_diagnostic: span_diagnostic, |
198 | pos: filemap.start_pos, | |
199 | last_pos: filemap.start_pos, | |
200 | col: CharPos(0), | |
201 | curr: Some('\n'), | |
202 | filemap: filemap, | |
9e0c209e SL |
203 | terminator: None, |
204 | save_new_lines: true, | |
9cc50fc6 | 205 | // dummy values; not read |
1a4d82fc | 206 | peek_tok: token::Eof, |
3157f602 | 207 | peek_span: syntax_pos::DUMMY_SP, |
9cc50fc6 | 208 | source_text: source_text, |
a7813a04 | 209 | fatal_errs: Vec::new(), |
9e0c209e | 210 | } |
1a4d82fc JJ |
211 | } |
212 | ||
9cc50fc6 | 213 | pub fn new<'b>(span_diagnostic: &'b Handler, |
3157f602 | 214 | filemap: Rc<syntax_pos::FileMap>) |
9cc50fc6 | 215 | -> StringReader<'b> { |
1a4d82fc | 216 | let mut sr = StringReader::new_raw(span_diagnostic, filemap); |
a7813a04 XL |
217 | if let Err(_) = sr.advance_token() { |
218 | sr.emit_fatal_errors(); | |
219 | panic!(FatalError); | |
220 | } | |
1a4d82fc JJ |
221 | sr |
222 | } | |
223 | ||
224 | pub fn curr_is(&self, c: char) -> bool { | |
225 | self.curr == Some(c) | |
226 | } | |
227 | ||
228 | /// Report a fatal lexical error with a given span. | |
92a42be0 SL |
229 | pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError { |
230 | self.span_diagnostic.span_fatal(sp, m) | |
1a4d82fc JJ |
231 | } |
232 | ||
233 | /// Report a lexical error with a given span. | |
234 | pub fn err_span(&self, sp: Span, m: &str) { | |
235 | self.span_diagnostic.span_err(sp, m) | |
236 | } | |
237 | ||
c1a9b12d | 238 | |
1a4d82fc | 239 | /// Report a fatal error spanning [`from_pos`, `to_pos`). |
92a42be0 | 240 | fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError { |
3157f602 | 241 | self.fatal_span(syntax_pos::mk_sp(from_pos, to_pos), m) |
1a4d82fc JJ |
242 | } |
243 | ||
244 | /// Report a lexical error spanning [`from_pos`, `to_pos`). | |
245 | fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) { | |
3157f602 | 246 | self.err_span(syntax_pos::mk_sp(from_pos, to_pos), m) |
1a4d82fc JJ |
247 | } |
248 | ||
249 | /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an | |
250 | /// escaped character to the error message | |
92a42be0 | 251 | fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError { |
1a4d82fc JJ |
252 | let mut m = m.to_string(); |
253 | m.push_str(": "); | |
9cc50fc6 SL |
254 | for c in c.escape_default() { |
255 | m.push(c) | |
256 | } | |
92a42be0 | 257 | self.fatal_span_(from_pos, to_pos, &m[..]) |
1a4d82fc | 258 | } |
9cc50fc6 SL |
259 | fn struct_fatal_span_char(&self, |
260 | from_pos: BytePos, | |
261 | to_pos: BytePos, | |
262 | m: &str, | |
263 | c: char) | |
264 | -> DiagnosticBuilder<'a> { | |
265 | let mut m = m.to_string(); | |
266 | m.push_str(": "); | |
267 | for c in c.escape_default() { | |
268 | m.push(c) | |
269 | } | |
3157f602 | 270 | self.span_diagnostic.struct_span_fatal(syntax_pos::mk_sp(from_pos, to_pos), &m[..]) |
9cc50fc6 | 271 | } |
1a4d82fc JJ |
272 | |
273 | /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an | |
274 | /// escaped character to the error message | |
275 | fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) { | |
276 | let mut m = m.to_string(); | |
277 | m.push_str(": "); | |
9cc50fc6 SL |
278 | for c in c.escape_default() { |
279 | m.push(c) | |
280 | } | |
85aaf69f | 281 | self.err_span_(from_pos, to_pos, &m[..]); |
1a4d82fc | 282 | } |
9cc50fc6 SL |
283 | fn struct_err_span_char(&self, |
284 | from_pos: BytePos, | |
285 | to_pos: BytePos, | |
286 | m: &str, | |
287 | c: char) | |
288 | -> DiagnosticBuilder<'a> { | |
289 | let mut m = m.to_string(); | |
290 | m.push_str(": "); | |
291 | for c in c.escape_default() { | |
292 | m.push(c) | |
293 | } | |
3157f602 | 294 | self.span_diagnostic.struct_span_err(syntax_pos::mk_sp(from_pos, to_pos), &m[..]) |
9cc50fc6 | 295 | } |
1a4d82fc JJ |
296 | |
297 | /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the | |
298 | /// offending string to the error message | |
92a42be0 | 299 | fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError { |
1a4d82fc | 300 | m.push_str(": "); |
85aaf69f SL |
301 | let from = self.byte_offset(from_pos).to_usize(); |
302 | let to = self.byte_offset(to_pos).to_usize(); | |
c34b1796 | 303 | m.push_str(&self.source_text[from..to]); |
92a42be0 | 304 | self.fatal_span_(from_pos, to_pos, &m[..]) |
1a4d82fc JJ |
305 | } |
306 | ||
307 | /// Advance peek_tok and peek_span to refer to the next token, and | |
308 | /// possibly update the interner. | |
a7813a04 | 309 | fn advance_token(&mut self) -> Result<(), ()> { |
1a4d82fc JJ |
310 | match self.scan_whitespace_or_comment() { |
311 | Some(comment) => { | |
312 | self.peek_span = comment.sp; | |
313 | self.peek_tok = comment.tok; | |
9cc50fc6 | 314 | } |
1a4d82fc JJ |
315 | None => { |
316 | if self.is_eof() { | |
317 | self.peek_tok = token::Eof; | |
3157f602 | 318 | self.peek_span = syntax_pos::mk_sp(self.filemap.end_pos, self.filemap.end_pos); |
1a4d82fc JJ |
319 | } else { |
320 | let start_bytepos = self.last_pos; | |
a7813a04 | 321 | self.peek_tok = self.next_token_inner()?; |
3157f602 | 322 | self.peek_span = syntax_pos::mk_sp(start_bytepos, self.last_pos); |
1a4d82fc JJ |
323 | }; |
324 | } | |
325 | } | |
a7813a04 | 326 | Ok(()) |
1a4d82fc JJ |
327 | } |
328 | ||
329 | fn byte_offset(&self, pos: BytePos) -> BytePos { | |
330 | (pos - self.filemap.start_pos) | |
331 | } | |
332 | ||
333 | /// Calls `f` with a string slice of the source text spanning from `start` | |
334 | /// up to but excluding `self.last_pos`, meaning the slice does not include | |
335 | /// the character `self.curr`. | |
9cc50fc6 SL |
336 | pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T |
337 | where F: FnOnce(&str) -> T | |
1a4d82fc JJ |
338 | { |
339 | self.with_str_from_to(start, self.last_pos, f) | |
340 | } | |
341 | ||
342 | /// Create a Name from a given offset to the current offset, each | |
343 | /// adjusted 1 towards each other (assumes that on either side there is a | |
344 | /// single-byte delimiter). | |
345 | pub fn name_from(&self, start: BytePos) -> ast::Name { | |
346 | debug!("taking an ident from {:?} to {:?}", start, self.last_pos); | |
347 | self.with_str_from(start, token::intern) | |
348 | } | |
349 | ||
350 | /// As name_from, with an explicit endpoint. | |
351 | pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name { | |
352 | debug!("taking an ident from {:?} to {:?}", start, end); | |
353 | self.with_str_from_to(start, end, token::intern) | |
354 | } | |
355 | ||
356 | /// Calls `f` with a string slice of the source text spanning from `start` | |
357 | /// up to but excluding `end`. | |
9cc50fc6 SL |
358 | fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T |
359 | where F: FnOnce(&str) -> T | |
1a4d82fc | 360 | { |
9cc50fc6 | 361 | f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()]) |
1a4d82fc JJ |
362 | } |
363 | ||
364 | /// Converts CRLF to LF in the given string, raising an error on bare CR. | |
9cc50fc6 | 365 | fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> { |
85aaf69f | 366 | let mut i = 0; |
1a4d82fc | 367 | while i < s.len() { |
d9579d0f | 368 | let ch = char_at(s, i); |
c34b1796 | 369 | let next = i + ch.len_utf8(); |
1a4d82fc | 370 | if ch == '\r' { |
d9579d0f AL |
371 | if next < s.len() && char_at(s, next) == '\n' { |
372 | return translate_crlf_(self, start, s, errmsg, i).into(); | |
1a4d82fc JJ |
373 | } |
374 | let pos = start + BytePos(i as u32); | |
375 | let end_pos = start + BytePos(next as u32); | |
376 | self.err_span_(pos, end_pos, errmsg); | |
377 | } | |
378 | i = next; | |
379 | } | |
d9579d0f | 380 | return s.into(); |
1a4d82fc | 381 | |
9cc50fc6 SL |
382 | fn translate_crlf_(rdr: &StringReader, |
383 | start: BytePos, | |
384 | s: &str, | |
385 | errmsg: &str, | |
386 | mut i: usize) | |
387 | -> String { | |
1a4d82fc JJ |
388 | let mut buf = String::with_capacity(s.len()); |
389 | let mut j = 0; | |
390 | while i < s.len() { | |
d9579d0f | 391 | let ch = char_at(s, i); |
c34b1796 | 392 | let next = i + ch.len_utf8(); |
1a4d82fc | 393 | if ch == '\r' { |
9cc50fc6 SL |
394 | if j < i { |
395 | buf.push_str(&s[j..i]); | |
396 | } | |
1a4d82fc | 397 | j = next; |
d9579d0f | 398 | if next >= s.len() || char_at(s, next) != '\n' { |
1a4d82fc JJ |
399 | let pos = start + BytePos(i as u32); |
400 | let end_pos = start + BytePos(next as u32); | |
401 | rdr.err_span_(pos, end_pos, errmsg); | |
402 | } | |
403 | } | |
404 | i = next; | |
405 | } | |
9cc50fc6 SL |
406 | if j < s.len() { |
407 | buf.push_str(&s[j..]); | |
408 | } | |
1a4d82fc JJ |
409 | buf |
410 | } | |
411 | } | |
412 | ||
413 | ||
414 | /// Advance the StringReader by one character. If a newline is | |
415 | /// discovered, add it to the FileMap's list of line start offsets. | |
416 | pub fn bump(&mut self) { | |
417 | self.last_pos = self.pos; | |
85aaf69f | 418 | let current_byte_offset = self.byte_offset(self.pos).to_usize(); |
c34b1796 | 419 | if current_byte_offset < self.source_text.len() { |
1a4d82fc JJ |
420 | assert!(self.curr.is_some()); |
421 | let last_char = self.curr.unwrap(); | |
d9579d0f | 422 | let ch = char_at(&self.source_text, current_byte_offset); |
c34b1796 AL |
423 | let next = current_byte_offset + ch.len_utf8(); |
424 | let byte_offset_diff = next - current_byte_offset; | |
85aaf69f | 425 | self.pos = self.pos + Pos::from_usize(byte_offset_diff); |
c34b1796 | 426 | self.curr = Some(ch); |
85aaf69f | 427 | self.col = self.col + CharPos(1); |
1a4d82fc | 428 | if last_char == '\n' { |
9e0c209e SL |
429 | if self.save_new_lines { |
430 | self.filemap.next_line(self.last_pos); | |
431 | } | |
85aaf69f | 432 | self.col = CharPos(0); |
1a4d82fc JJ |
433 | } |
434 | ||
435 | if byte_offset_diff > 1 { | |
436 | self.filemap.record_multibyte_char(self.last_pos, byte_offset_diff); | |
437 | } | |
438 | } else { | |
439 | self.curr = None; | |
440 | } | |
441 | } | |
442 | ||
443 | pub fn nextch(&self) -> Option<char> { | |
85aaf69f | 444 | let offset = self.byte_offset(self.pos).to_usize(); |
c34b1796 | 445 | if offset < self.source_text.len() { |
d9579d0f | 446 | Some(char_at(&self.source_text, offset)) |
1a4d82fc JJ |
447 | } else { |
448 | None | |
449 | } | |
450 | } | |
451 | ||
452 | pub fn nextch_is(&self, c: char) -> bool { | |
453 | self.nextch() == Some(c) | |
454 | } | |
455 | ||
456 | pub fn nextnextch(&self) -> Option<char> { | |
85aaf69f | 457 | let offset = self.byte_offset(self.pos).to_usize(); |
c34b1796 | 458 | let s = &self.source_text[..]; |
9cc50fc6 SL |
459 | if offset >= s.len() { |
460 | return None; | |
461 | } | |
d9579d0f | 462 | let next = offset + char_at(s, offset).len_utf8(); |
1a4d82fc | 463 | if next < s.len() { |
d9579d0f | 464 | Some(char_at(s, next)) |
1a4d82fc JJ |
465 | } else { |
466 | None | |
467 | } | |
468 | } | |
469 | ||
470 | pub fn nextnextch_is(&self, c: char) -> bool { | |
471 | self.nextnextch() == Some(c) | |
472 | } | |
473 | ||
474 | /// Eats <XID_start><XID_continue>*, if possible. | |
475 | fn scan_optional_raw_name(&mut self) -> Option<ast::Name> { | |
476 | if !ident_start(self.curr) { | |
9cc50fc6 | 477 | return None; |
1a4d82fc JJ |
478 | } |
479 | let start = self.last_pos; | |
480 | while ident_continue(self.curr) { | |
481 | self.bump(); | |
482 | } | |
483 | ||
484 | self.with_str_from(start, |string| { | |
485 | if string == "_" { | |
486 | None | |
487 | } else { | |
488 | Some(token::intern(string)) | |
489 | } | |
490 | }) | |
491 | } | |
492 | ||
493 | /// PRECONDITION: self.curr is not whitespace | |
494 | /// Eats any kind of comment. | |
495 | fn scan_comment(&mut self) -> Option<TokenAndSpan> { | |
3157f602 XL |
496 | if let Some(c) = self.curr { |
497 | if c.is_whitespace() { | |
498 | self.span_diagnostic.span_err(syntax_pos::mk_sp(self.last_pos, self.last_pos), | |
499 | "called consume_any_line_comment, but there \ | |
500 | was whitespace"); | |
9cc50fc6 | 501 | } |
1a4d82fc JJ |
502 | } |
503 | ||
504 | if self.curr_is('/') { | |
505 | match self.nextch() { | |
506 | Some('/') => { | |
507 | self.bump(); | |
508 | self.bump(); | |
62682a34 | 509 | |
1a4d82fc | 510 | // line comments starting with "///" or "//!" are doc-comments |
62682a34 SL |
511 | let doc_comment = self.curr_is('/') || self.curr_is('!'); |
512 | let start_bpos = if doc_comment { | |
513 | self.pos - BytePos(3) | |
514 | } else { | |
515 | self.last_pos - BytePos(2) | |
516 | }; | |
517 | ||
518 | while !self.is_eof() { | |
519 | match self.curr.unwrap() { | |
520 | '\n' => break, | |
521 | '\r' => { | |
522 | if self.nextch_is('\n') { | |
523 | // CRLF | |
9cc50fc6 | 524 | break; |
62682a34 | 525 | } else if doc_comment { |
9cc50fc6 SL |
526 | self.err_span_(self.last_pos, |
527 | self.pos, | |
62682a34 | 528 | "bare CR not allowed in doc-comment"); |
1a4d82fc | 529 | } |
1a4d82fc | 530 | } |
9cc50fc6 | 531 | _ => (), |
1a4d82fc | 532 | } |
62682a34 SL |
533 | self.bump(); |
534 | } | |
535 | ||
536 | return if doc_comment { | |
537 | self.with_str_from(start_bpos, |string| { | |
538 | // comments with only more "/"s are not doc comments | |
1a4d82fc JJ |
539 | let tok = if is_doc_comment(string) { |
540 | token::DocComment(token::intern(string)) | |
541 | } else { | |
542 | token::Comment | |
543 | }; | |
544 | ||
62682a34 | 545 | Some(TokenAndSpan { |
1a4d82fc | 546 | tok: tok, |
3157f602 | 547 | sp: syntax_pos::mk_sp(start_bpos, self.last_pos), |
62682a34 SL |
548 | }) |
549 | }) | |
1a4d82fc | 550 | } else { |
62682a34 | 551 | Some(TokenAndSpan { |
1a4d82fc | 552 | tok: token::Comment, |
3157f602 | 553 | sp: syntax_pos::mk_sp(start_bpos, self.last_pos), |
62682a34 | 554 | }) |
9cc50fc6 | 555 | }; |
1a4d82fc JJ |
556 | } |
557 | Some('*') => { | |
9cc50fc6 SL |
558 | self.bump(); |
559 | self.bump(); | |
1a4d82fc JJ |
560 | self.scan_block_comment() |
561 | } | |
9cc50fc6 | 562 | _ => None, |
1a4d82fc JJ |
563 | } |
564 | } else if self.curr_is('#') { | |
565 | if self.nextch_is('!') { | |
566 | ||
567 | // Parse an inner attribute. | |
568 | if self.nextnextch_is('[') { | |
569 | return None; | |
570 | } | |
571 | ||
572 | // I guess this is the only way to figure out if | |
573 | // we're at the beginning of the file... | |
574 | let cmap = CodeMap::new(); | |
575 | cmap.files.borrow_mut().push(self.filemap.clone()); | |
576 | let loc = cmap.lookup_char_pos_adj(self.last_pos); | |
577 | debug!("Skipping a shebang"); | |
85aaf69f | 578 | if loc.line == 1 && loc.col == CharPos(0) { |
1a4d82fc JJ |
579 | // FIXME: Add shebang "token", return it |
580 | let start = self.last_pos; | |
9cc50fc6 SL |
581 | while !self.curr_is('\n') && !self.is_eof() { |
582 | self.bump(); | |
583 | } | |
1a4d82fc JJ |
584 | return Some(TokenAndSpan { |
585 | tok: token::Shebang(self.name_from(start)), | |
3157f602 | 586 | sp: syntax_pos::mk_sp(start, self.last_pos), |
1a4d82fc JJ |
587 | }); |
588 | } | |
589 | } | |
590 | None | |
591 | } else { | |
592 | None | |
593 | } | |
594 | } | |
595 | ||
596 | /// If there is whitespace, shebang, or a comment, scan it. Otherwise, | |
597 | /// return None. | |
598 | fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> { | |
599 | match self.curr.unwrap_or('\0') { | |
600 | // # to handle shebang at start of file -- this is the entry point | |
601 | // for skipping over all "junk" | |
602 | '/' | '#' => { | |
603 | let c = self.scan_comment(); | |
604 | debug!("scanning a comment {:?}", c); | |
605 | c | |
54a0048b SL |
606 | }, |
607 | c if is_pattern_whitespace(Some(c)) => { | |
1a4d82fc | 608 | let start_bpos = self.last_pos; |
54a0048b | 609 | while is_pattern_whitespace(self.curr) { |
9cc50fc6 SL |
610 | self.bump(); |
611 | } | |
1a4d82fc JJ |
612 | let c = Some(TokenAndSpan { |
613 | tok: token::Whitespace, | |
3157f602 | 614 | sp: syntax_pos::mk_sp(start_bpos, self.last_pos), |
1a4d82fc JJ |
615 | }); |
616 | debug!("scanning whitespace: {:?}", c); | |
617 | c | |
9cc50fc6 SL |
618 | } |
619 | _ => None, | |
1a4d82fc JJ |
620 | } |
621 | } | |
622 | ||
623 | /// Might return a sugared-doc-attr | |
624 | fn scan_block_comment(&mut self) -> Option<TokenAndSpan> { | |
625 | // block comments starting with "/**" or "/*!" are doc-comments | |
626 | let is_doc_comment = self.curr_is('*') || self.curr_is('!'); | |
627 | let start_bpos = self.last_pos - BytePos(2); | |
628 | ||
85aaf69f | 629 | let mut level: isize = 1; |
1a4d82fc JJ |
630 | let mut has_cr = false; |
631 | while level > 0 { | |
632 | if self.is_eof() { | |
633 | let msg = if is_doc_comment { | |
634 | "unterminated block doc-comment" | |
635 | } else { | |
636 | "unterminated block comment" | |
637 | }; | |
638 | let last_bpos = self.last_pos; | |
92a42be0 | 639 | panic!(self.fatal_span_(start_bpos, last_bpos, msg)); |
1a4d82fc JJ |
640 | } |
641 | let n = self.curr.unwrap(); | |
642 | match n { | |
643 | '/' if self.nextch_is('*') => { | |
644 | level += 1; | |
645 | self.bump(); | |
646 | } | |
647 | '*' if self.nextch_is('/') => { | |
648 | level -= 1; | |
649 | self.bump(); | |
650 | } | |
651 | '\r' => { | |
652 | has_cr = true; | |
653 | } | |
9cc50fc6 | 654 | _ => (), |
1a4d82fc JJ |
655 | } |
656 | self.bump(); | |
657 | } | |
658 | ||
659 | self.with_str_from(start_bpos, |string| { | |
660 | // but comments with only "*"s between two "/"s are not | |
661 | let tok = if is_block_doc_comment(string) { | |
662 | let string = if has_cr { | |
9cc50fc6 SL |
663 | self.translate_crlf(start_bpos, |
664 | string, | |
1a4d82fc | 665 | "bare CR not allowed in block doc-comment") |
9cc50fc6 SL |
666 | } else { |
667 | string.into() | |
668 | }; | |
85aaf69f | 669 | token::DocComment(token::intern(&string[..])) |
1a4d82fc JJ |
670 | } else { |
671 | token::Comment | |
672 | }; | |
673 | ||
9cc50fc6 | 674 | Some(TokenAndSpan { |
1a4d82fc | 675 | tok: tok, |
3157f602 | 676 | sp: syntax_pos::mk_sp(start_bpos, self.last_pos), |
1a4d82fc JJ |
677 | }) |
678 | }) | |
679 | } | |
680 | ||
c34b1796 AL |
681 | /// Scan through any digits (base `scan_radix`) or underscores, |
682 | /// and return how many digits there were. | |
683 | /// | |
684 | /// `real_radix` represents the true radix of the number we're | |
685 | /// interested in, and errors will be emitted for any digits | |
686 | /// between `real_radix` and `scan_radix`. | |
687 | fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize { | |
688 | assert!(real_radix <= scan_radix); | |
85aaf69f | 689 | let mut len = 0; |
1a4d82fc JJ |
690 | loop { |
691 | let c = self.curr; | |
9cc50fc6 SL |
692 | if c == Some('_') { |
693 | debug!("skipping a _"); | |
694 | self.bump(); | |
695 | continue; | |
696 | } | |
c34b1796 | 697 | match c.and_then(|cc| cc.to_digit(scan_radix)) { |
1a4d82fc JJ |
698 | Some(_) => { |
699 | debug!("{:?} in scan_digits", c); | |
c34b1796 AL |
700 | // check that the hypothetical digit is actually |
701 | // in range for the true radix | |
702 | if c.unwrap().to_digit(real_radix).is_none() { | |
9cc50fc6 SL |
703 | self.err_span_(self.last_pos, |
704 | self.pos, | |
705 | &format!("invalid digit for a base {} literal", real_radix)); | |
c34b1796 | 706 | } |
1a4d82fc JJ |
707 | len += 1; |
708 | self.bump(); | |
709 | } | |
9cc50fc6 | 710 | _ => return len, |
1a4d82fc | 711 | } |
9cc50fc6 | 712 | } |
1a4d82fc JJ |
713 | } |
714 | ||
715 | /// Lex a LIT_INTEGER or a LIT_FLOAT | |
716 | fn scan_number(&mut self, c: char) -> token::Lit { | |
c1a9b12d | 717 | let num_digits; |
1a4d82fc JJ |
718 | let mut base = 10; |
719 | let start_bpos = self.last_pos; | |
720 | ||
721 | self.bump(); | |
722 | ||
723 | if c == '0' { | |
724 | match self.curr.unwrap_or('\0') { | |
9cc50fc6 SL |
725 | 'b' => { |
726 | self.bump(); | |
727 | base = 2; | |
728 | num_digits = self.scan_digits(2, 10); | |
729 | } | |
730 | 'o' => { | |
731 | self.bump(); | |
732 | base = 8; | |
733 | num_digits = self.scan_digits(8, 10); | |
734 | } | |
735 | 'x' => { | |
736 | self.bump(); | |
737 | base = 16; | |
738 | num_digits = self.scan_digits(16, 16); | |
739 | } | |
1a4d82fc | 740 | '0'...'9' | '_' | '.' => { |
c34b1796 | 741 | num_digits = self.scan_digits(10, 10) + 1; |
1a4d82fc JJ |
742 | } |
743 | _ => { | |
744 | // just a 0 | |
745 | return token::Integer(self.name_from(start_bpos)); | |
746 | } | |
747 | } | |
748 | } else if c.is_digit(10) { | |
c34b1796 | 749 | num_digits = self.scan_digits(10, 10) + 1; |
1a4d82fc JJ |
750 | } else { |
751 | num_digits = 0; | |
752 | } | |
753 | ||
754 | if num_digits == 0 { | |
9cc50fc6 SL |
755 | self.err_span_(start_bpos, |
756 | self.last_pos, | |
757 | "no valid digits found for number"); | |
1a4d82fc JJ |
758 | return token::Integer(token::intern("0")); |
759 | } | |
760 | ||
761 | // might be a float, but don't be greedy if this is actually an | |
762 | // integer literal followed by field/method access or a range pattern | |
763 | // (`0..2` and `12.foo()`) | |
9cc50fc6 SL |
764 | if self.curr_is('.') && !self.nextch_is('.') && |
765 | !self.nextch() | |
766 | .unwrap_or('\0') | |
767 | .is_xid_start() { | |
1a4d82fc JJ |
768 | // might have stuff after the ., and if it does, it needs to start |
769 | // with a number | |
770 | self.bump(); | |
771 | if self.curr.unwrap_or('\0').is_digit(10) { | |
c34b1796 | 772 | self.scan_digits(10, 10); |
1a4d82fc JJ |
773 | self.scan_float_exponent(); |
774 | } | |
775 | let last_pos = self.last_pos; | |
776 | self.check_float_base(start_bpos, last_pos, base); | |
777 | return token::Float(self.name_from(start_bpos)); | |
778 | } else { | |
779 | // it might be a float if it has an exponent | |
780 | if self.curr_is('e') || self.curr_is('E') { | |
781 | self.scan_float_exponent(); | |
782 | let last_pos = self.last_pos; | |
783 | self.check_float_base(start_bpos, last_pos, base); | |
784 | return token::Float(self.name_from(start_bpos)); | |
785 | } | |
786 | // but we certainly have an integer! | |
787 | return token::Integer(self.name_from(start_bpos)); | |
788 | } | |
789 | } | |
790 | ||
791 | /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an | |
792 | /// error if too many or too few digits are encountered. | |
9cc50fc6 | 793 | fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool { |
1a4d82fc JJ |
794 | debug!("scanning {} digits until {:?}", n_digits, delim); |
795 | let start_bpos = self.last_pos; | |
796 | let mut accum_int = 0; | |
797 | ||
c34b1796 | 798 | let mut valid = true; |
85aaf69f | 799 | for _ in 0..n_digits { |
1a4d82fc JJ |
800 | if self.is_eof() { |
801 | let last_bpos = self.last_pos; | |
92a42be0 SL |
802 | panic!(self.fatal_span_(start_bpos, |
803 | last_bpos, | |
804 | "unterminated numeric character escape")); | |
1a4d82fc JJ |
805 | } |
806 | if self.curr_is(delim) { | |
807 | let last_bpos = self.last_pos; | |
9cc50fc6 SL |
808 | self.err_span_(start_bpos, |
809 | last_bpos, | |
810 | "numeric character escape is too short"); | |
c34b1796 | 811 | valid = false; |
1a4d82fc JJ |
812 | break; |
813 | } | |
814 | let c = self.curr.unwrap_or('\x00'); | |
815 | accum_int *= 16; | |
816 | accum_int += c.to_digit(16).unwrap_or_else(|| { | |
9cc50fc6 SL |
817 | self.err_span_char(self.last_pos, |
818 | self.pos, | |
819 | "invalid character in numeric character escape", | |
820 | c); | |
c34b1796 AL |
821 | |
822 | valid = false; | |
1a4d82fc | 823 | 0 |
c34b1796 | 824 | }); |
1a4d82fc JJ |
825 | self.bump(); |
826 | } | |
827 | ||
828 | if below_0x7f_only && accum_int >= 0x80 { | |
829 | self.err_span_(start_bpos, | |
830 | self.last_pos, | |
9cc50fc6 SL |
831 | "this form of character escape may only be used with characters in \ |
832 | the range [\\x00-\\x7f]"); | |
c34b1796 | 833 | valid = false; |
1a4d82fc JJ |
834 | } |
835 | ||
836 | match char::from_u32(accum_int) { | |
c34b1796 | 837 | Some(_) => valid, |
1a4d82fc JJ |
838 | None => { |
839 | let last_bpos = self.last_pos; | |
c1a9b12d | 840 | self.err_span_(start_bpos, last_bpos, "invalid numeric character escape"); |
1a4d82fc JJ |
841 | false |
842 | } | |
843 | } | |
844 | } | |
845 | ||
1a4d82fc JJ |
846 | /// Scan for a single (possibly escaped) byte or char |
847 | /// in a byte, (non-raw) byte string, char, or (non-raw) string literal. | |
848 | /// `start` is the position of `first_source_char`, which is already consumed. | |
849 | /// | |
850 | /// Returns true if there was a valid char/byte, false otherwise. | |
9cc50fc6 SL |
851 | fn scan_char_or_byte(&mut self, |
852 | start: BytePos, | |
853 | first_source_char: char, | |
854 | ascii_only: bool, | |
855 | delim: char) | |
856 | -> bool { | |
1a4d82fc JJ |
857 | match first_source_char { |
858 | '\\' => { | |
859 | // '\X' for some X must be a character constant: | |
860 | let escaped = self.curr; | |
861 | let escaped_pos = self.last_pos; | |
862 | self.bump(); | |
863 | match escaped { | |
9cc50fc6 | 864 | None => {} // EOF here is an error that will be checked later. |
1a4d82fc JJ |
865 | Some(e) => { |
866 | return match e { | |
867 | 'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true, | |
868 | 'x' => self.scan_byte_escape(delim, !ascii_only), | |
c1a9b12d SL |
869 | 'u' => { |
870 | let valid = if self.curr_is('{') { | |
871 | self.scan_unicode_escape(delim) && !ascii_only | |
872 | } else { | |
3157f602 | 873 | let span = syntax_pos::mk_sp(start, self.last_pos); |
9cc50fc6 SL |
874 | self.span_diagnostic |
875 | .struct_span_err(span, "incorrect unicode escape sequence") | |
876 | .span_help(span, | |
877 | "format of unicode escape sequences is \ | |
878 | `\\u{…}`") | |
879 | .emit(); | |
c1a9b12d SL |
880 | false |
881 | }; | |
882 | if ascii_only { | |
9cc50fc6 SL |
883 | self.err_span_(start, |
884 | self.last_pos, | |
885 | "unicode escape sequences cannot be used as a \ | |
886 | byte or in a byte string"); | |
62682a34 | 887 | } |
c1a9b12d SL |
888 | valid |
889 | ||
1a4d82fc JJ |
890 | } |
891 | '\n' if delim == '"' => { | |
892 | self.consume_whitespace(); | |
893 | true | |
9cc50fc6 | 894 | } |
1a4d82fc JJ |
895 | '\r' if delim == '"' && self.curr_is('\n') => { |
896 | self.consume_whitespace(); | |
897 | true | |
898 | } | |
899 | c => { | |
900 | let last_pos = self.last_pos; | |
9cc50fc6 SL |
901 | let mut err = self.struct_err_span_char(escaped_pos, |
902 | last_pos, | |
903 | if ascii_only { | |
904 | "unknown byte escape" | |
905 | } else { | |
906 | "unknown character \ | |
907 | escape" | |
908 | }, | |
909 | c); | |
1a4d82fc | 910 | if e == '\r' { |
3157f602 | 911 | err.span_help(syntax_pos::mk_sp(escaped_pos, last_pos), |
9cc50fc6 SL |
912 | "this is an isolated carriage return; consider \ |
913 | checking your editor and version control \ | |
914 | settings"); | |
1a4d82fc | 915 | } |
9346a6ac | 916 | if (e == '{' || e == '}') && !ascii_only { |
3157f602 | 917 | err.span_help(syntax_pos::mk_sp(escaped_pos, last_pos), |
9cc50fc6 SL |
918 | "if used in a formatting string, curly braces \ |
919 | are escaped with `{{` and `}}`"); | |
9346a6ac | 920 | } |
9cc50fc6 | 921 | err.emit(); |
1a4d82fc JJ |
922 | false |
923 | } | |
924 | } | |
925 | } | |
926 | } | |
927 | } | |
928 | '\t' | '\n' | '\r' | '\'' if delim == '\'' => { | |
929 | let last_pos = self.last_pos; | |
9cc50fc6 SL |
930 | self.err_span_char(start, |
931 | last_pos, | |
932 | if ascii_only { | |
933 | "byte constant must be escaped" | |
934 | } else { | |
935 | "character constant must be escaped" | |
936 | }, | |
937 | first_source_char); | |
1a4d82fc JJ |
938 | return false; |
939 | } | |
940 | '\r' => { | |
941 | if self.curr_is('\n') { | |
942 | self.bump(); | |
943 | return true; | |
944 | } else { | |
9cc50fc6 SL |
945 | self.err_span_(start, |
946 | self.last_pos, | |
1a4d82fc JJ |
947 | "bare CR not allowed in string, use \\r instead"); |
948 | return false; | |
949 | } | |
950 | } | |
9cc50fc6 SL |
951 | _ => { |
952 | if ascii_only && first_source_char > '\x7F' { | |
953 | let last_pos = self.last_pos; | |
a7813a04 XL |
954 | self.err_span_(start, |
955 | last_pos, | |
956 | "byte constant must be ASCII. Use a \\xHH escape for a \ | |
957 | non-ASCII byte"); | |
9cc50fc6 SL |
958 | return false; |
959 | } | |
1a4d82fc JJ |
960 | } |
961 | } | |
962 | true | |
963 | } | |
964 | ||
965 | /// Scan over a \u{...} escape | |
966 | /// | |
967 | /// At this point, we have already seen the \ and the u, the { is the current character. We | |
968 | /// will read at least one digit, and up to 6, and pass over the }. | |
969 | fn scan_unicode_escape(&mut self, delim: char) -> bool { | |
970 | self.bump(); // past the { | |
971 | let start_bpos = self.last_pos; | |
85aaf69f | 972 | let mut count = 0; |
1a4d82fc | 973 | let mut accum_int = 0; |
c34b1796 | 974 | let mut valid = true; |
1a4d82fc JJ |
975 | |
976 | while !self.curr_is('}') && count <= 6 { | |
977 | let c = match self.curr { | |
978 | Some(c) => c, | |
979 | None => { | |
9cc50fc6 SL |
980 | panic!(self.fatal_span_(start_bpos, |
981 | self.last_pos, | |
92a42be0 | 982 | "unterminated unicode escape (found EOF)")); |
1a4d82fc JJ |
983 | } |
984 | }; | |
985 | accum_int *= 16; | |
986 | accum_int += c.to_digit(16).unwrap_or_else(|| { | |
987 | if c == delim { | |
9cc50fc6 SL |
988 | panic!(self.fatal_span_(self.last_pos, |
989 | self.pos, | |
92a42be0 | 990 | "unterminated unicode escape (needed a `}`)")); |
1a4d82fc | 991 | } else { |
9cc50fc6 SL |
992 | self.err_span_char(self.last_pos, |
993 | self.pos, | |
994 | "invalid character in unicode escape", | |
995 | c); | |
1a4d82fc | 996 | } |
c34b1796 AL |
997 | valid = false; |
998 | 0 | |
999 | }); | |
1a4d82fc JJ |
1000 | self.bump(); |
1001 | count += 1; | |
1002 | } | |
1003 | ||
1004 | if count > 6 { | |
9cc50fc6 SL |
1005 | self.err_span_(start_bpos, |
1006 | self.last_pos, | |
1007 | "overlong unicode escape (can have at most 6 hex digits)"); | |
c34b1796 | 1008 | valid = false; |
1a4d82fc JJ |
1009 | } |
1010 | ||
c34b1796 | 1011 | if valid && (char::from_u32(accum_int).is_none() || count == 0) { |
9cc50fc6 SL |
1012 | self.err_span_(start_bpos, |
1013 | self.last_pos, | |
1014 | "invalid unicode character escape"); | |
62682a34 | 1015 | valid = false; |
1a4d82fc JJ |
1016 | } |
1017 | ||
c1a9b12d | 1018 | self.bump(); // past the ending } |
1a4d82fc JJ |
1019 | valid |
1020 | } | |
1021 | ||
1022 | /// Scan over a float exponent. | |
1023 | fn scan_float_exponent(&mut self) { | |
1024 | if self.curr_is('e') || self.curr_is('E') { | |
1025 | self.bump(); | |
1026 | if self.curr_is('-') || self.curr_is('+') { | |
1027 | self.bump(); | |
1028 | } | |
c34b1796 | 1029 | if self.scan_digits(10, 10) == 0 { |
9cc50fc6 SL |
1030 | self.err_span_(self.last_pos, |
1031 | self.pos, | |
1032 | "expected at least one digit in exponent") | |
1a4d82fc JJ |
1033 | } |
1034 | } | |
1035 | } | |
1036 | ||
1037 | /// Check that a base is valid for a floating literal, emitting a nice | |
1038 | /// error if it isn't. | |
85aaf69f | 1039 | fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) { |
1a4d82fc | 1040 | match base { |
9cc50fc6 SL |
1041 | 16 => { |
1042 | self.err_span_(start_bpos, | |
1043 | last_bpos, | |
1044 | "hexadecimal float literal is not supported") | |
1045 | } | |
1046 | 8 => { | |
1047 | self.err_span_(start_bpos, | |
1048 | last_bpos, | |
1049 | "octal float literal is not supported") | |
1050 | } | |
1051 | 2 => { | |
1052 | self.err_span_(start_bpos, | |
1053 | last_bpos, | |
1054 | "binary float literal is not supported") | |
1055 | } | |
1056 | _ => (), | |
1a4d82fc JJ |
1057 | } |
1058 | } | |
1059 | ||
1060 | fn binop(&mut self, op: token::BinOpToken) -> token::Token { | |
1061 | self.bump(); | |
1062 | if self.curr_is('=') { | |
1063 | self.bump(); | |
1064 | return token::BinOpEq(op); | |
1065 | } else { | |
1066 | return token::BinOp(op); | |
1067 | } | |
1068 | } | |
1069 | ||
1070 | /// Return the next token from the string, advances the input past that | |
1071 | /// token, and updates the interner | |
a7813a04 | 1072 | fn next_token_inner(&mut self) -> Result<token::Token, ()> { |
1a4d82fc | 1073 | let c = self.curr; |
9cc50fc6 SL |
1074 | if ident_start(c) && |
1075 | match (c.unwrap(), self.nextch(), self.nextnextch()) { | |
1a4d82fc JJ |
1076 | // Note: r as in r" or r#" is part of a raw string literal, |
1077 | // b as in b' is part of a byte literal. | |
1078 | // They are not identifiers, and are handled further down. | |
9cc50fc6 SL |
1079 | ('r', Some('"'), _) | |
1080 | ('r', Some('#'), _) | | |
1081 | ('b', Some('"'), _) | | |
1082 | ('b', Some('\''), _) | | |
1083 | ('b', Some('r'), Some('"')) | | |
1084 | ('b', Some('r'), Some('#')) => false, | |
1085 | _ => true, | |
1a4d82fc JJ |
1086 | } { |
1087 | let start = self.last_pos; | |
1088 | while ident_continue(self.curr) { | |
1089 | self.bump(); | |
1090 | } | |
1091 | ||
a7813a04 | 1092 | return Ok(self.with_str_from(start, |string| { |
1a4d82fc JJ |
1093 | if string == "_" { |
1094 | token::Underscore | |
1095 | } else { | |
1096 | // FIXME: perform NFKC normalization here. (Issue #2253) | |
a7813a04 | 1097 | token::Ident(str_to_ident(string)) |
1a4d82fc | 1098 | } |
a7813a04 | 1099 | })); |
1a4d82fc JJ |
1100 | } |
1101 | ||
1102 | if is_dec_digit(c) { | |
1103 | let num = self.scan_number(c.unwrap()); | |
1104 | let suffix = self.scan_optional_raw_name(); | |
1105 | debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix); | |
a7813a04 | 1106 | return Ok(token::Literal(num, suffix)); |
1a4d82fc JJ |
1107 | } |
1108 | ||
1a4d82fc | 1109 | match c.expect("next_token_inner called at EOF") { |
9cc50fc6 SL |
1110 | // One-byte tokens. |
1111 | ';' => { | |
1a4d82fc | 1112 | self.bump(); |
a7813a04 | 1113 | return Ok(token::Semi); |
9cc50fc6 SL |
1114 | } |
1115 | ',' => { | |
1116 | self.bump(); | |
a7813a04 | 1117 | return Ok(token::Comma); |
9cc50fc6 SL |
1118 | } |
1119 | '.' => { | |
1120 | self.bump(); | |
1121 | return if self.curr_is('.') { | |
1122 | self.bump(); | |
1123 | if self.curr_is('.') { | |
1124 | self.bump(); | |
a7813a04 | 1125 | Ok(token::DotDotDot) |
9cc50fc6 | 1126 | } else { |
a7813a04 | 1127 | Ok(token::DotDot) |
9cc50fc6 SL |
1128 | } |
1129 | } else { | |
a7813a04 | 1130 | Ok(token::Dot) |
9cc50fc6 SL |
1131 | }; |
1132 | } | |
1133 | '(' => { | |
1134 | self.bump(); | |
a7813a04 | 1135 | return Ok(token::OpenDelim(token::Paren)); |
9cc50fc6 SL |
1136 | } |
1137 | ')' => { | |
1138 | self.bump(); | |
a7813a04 | 1139 | return Ok(token::CloseDelim(token::Paren)); |
9cc50fc6 SL |
1140 | } |
1141 | '{' => { | |
1142 | self.bump(); | |
a7813a04 | 1143 | return Ok(token::OpenDelim(token::Brace)); |
9cc50fc6 SL |
1144 | } |
1145 | '}' => { | |
1146 | self.bump(); | |
a7813a04 | 1147 | return Ok(token::CloseDelim(token::Brace)); |
9cc50fc6 SL |
1148 | } |
1149 | '[' => { | |
1150 | self.bump(); | |
a7813a04 | 1151 | return Ok(token::OpenDelim(token::Bracket)); |
9cc50fc6 SL |
1152 | } |
1153 | ']' => { | |
1154 | self.bump(); | |
a7813a04 | 1155 | return Ok(token::CloseDelim(token::Bracket)); |
9cc50fc6 SL |
1156 | } |
1157 | '@' => { | |
1158 | self.bump(); | |
a7813a04 | 1159 | return Ok(token::At); |
9cc50fc6 SL |
1160 | } |
1161 | '#' => { | |
1162 | self.bump(); | |
a7813a04 | 1163 | return Ok(token::Pound); |
9cc50fc6 SL |
1164 | } |
1165 | '~' => { | |
1166 | self.bump(); | |
a7813a04 | 1167 | return Ok(token::Tilde); |
9cc50fc6 SL |
1168 | } |
1169 | '?' => { | |
1170 | self.bump(); | |
a7813a04 | 1171 | return Ok(token::Question); |
9cc50fc6 SL |
1172 | } |
1173 | ':' => { | |
1174 | self.bump(); | |
1175 | if self.curr_is(':') { | |
1176 | self.bump(); | |
a7813a04 | 1177 | return Ok(token::ModSep); |
9cc50fc6 | 1178 | } else { |
a7813a04 | 1179 | return Ok(token::Colon); |
9cc50fc6 | 1180 | } |
1a4d82fc | 1181 | } |
1a4d82fc | 1182 | |
9cc50fc6 SL |
1183 | '$' => { |
1184 | self.bump(); | |
a7813a04 | 1185 | return Ok(token::Dollar); |
9cc50fc6 | 1186 | } |
1a4d82fc | 1187 | |
9cc50fc6 SL |
1188 | // Multi-byte tokens. |
1189 | '=' => { | |
1a4d82fc | 1190 | self.bump(); |
9cc50fc6 SL |
1191 | if self.curr_is('=') { |
1192 | self.bump(); | |
a7813a04 | 1193 | return Ok(token::EqEq); |
9cc50fc6 SL |
1194 | } else if self.curr_is('>') { |
1195 | self.bump(); | |
a7813a04 | 1196 | return Ok(token::FatArrow); |
9cc50fc6 | 1197 | } else { |
a7813a04 | 1198 | return Ok(token::Eq); |
9cc50fc6 SL |
1199 | } |
1200 | } | |
1201 | '!' => { | |
1a4d82fc | 1202 | self.bump(); |
9cc50fc6 SL |
1203 | if self.curr_is('=') { |
1204 | self.bump(); | |
a7813a04 | 1205 | return Ok(token::Ne); |
9cc50fc6 | 1206 | } else { |
a7813a04 | 1207 | return Ok(token::Not); |
9cc50fc6 | 1208 | } |
1a4d82fc | 1209 | } |
9cc50fc6 | 1210 | '<' => { |
1a4d82fc | 1211 | self.bump(); |
9cc50fc6 SL |
1212 | match self.curr.unwrap_or('\x00') { |
1213 | '=' => { | |
1214 | self.bump(); | |
a7813a04 | 1215 | return Ok(token::Le); |
9cc50fc6 SL |
1216 | } |
1217 | '<' => { | |
a7813a04 | 1218 | return Ok(self.binop(token::Shl)); |
9cc50fc6 SL |
1219 | } |
1220 | '-' => { | |
1221 | self.bump(); | |
1222 | match self.curr.unwrap_or('\x00') { | |
1223 | _ => { | |
a7813a04 | 1224 | return Ok(token::LArrow); |
9cc50fc6 SL |
1225 | } |
1226 | } | |
1227 | } | |
1228 | _ => { | |
a7813a04 | 1229 | return Ok(token::Lt); |
9cc50fc6 SL |
1230 | } |
1231 | } | |
1232 | } | |
1233 | '>' => { | |
1a4d82fc JJ |
1234 | self.bump(); |
1235 | match self.curr.unwrap_or('\x00') { | |
9cc50fc6 SL |
1236 | '=' => { |
1237 | self.bump(); | |
a7813a04 | 1238 | return Ok(token::Ge); |
9cc50fc6 SL |
1239 | } |
1240 | '>' => { | |
a7813a04 | 1241 | return Ok(self.binop(token::Shr)); |
9cc50fc6 SL |
1242 | } |
1243 | _ => { | |
a7813a04 | 1244 | return Ok(token::Gt); |
9cc50fc6 | 1245 | } |
1a4d82fc | 1246 | } |
1a4d82fc | 1247 | } |
9cc50fc6 SL |
1248 | '\'' => { |
1249 | // Either a character constant 'a' OR a lifetime name 'abc | |
1250 | let start_with_quote = self.last_pos; | |
1251 | self.bump(); | |
1252 | let start = self.last_pos; | |
1a4d82fc | 1253 | |
9cc50fc6 SL |
1254 | // the eof will be picked up by the final `'` check below |
1255 | let c2 = self.curr.unwrap_or('\x00'); | |
1256 | self.bump(); | |
1a4d82fc | 1257 | |
9cc50fc6 SL |
1258 | // If the character is an ident start not followed by another single |
1259 | // quote, then this is a lifetime name: | |
1260 | if ident_start(Some(c2)) && !self.curr_is('\'') { | |
1261 | while ident_continue(self.curr) { | |
1262 | self.bump(); | |
1263 | } | |
1264 | // lifetimes shouldn't end with a single quote | |
1265 | // if we find one, then this is an invalid character literal | |
1266 | if self.curr_is('\'') { | |
1267 | panic!(self.fatal_span_verbose( | |
1268 | start_with_quote, self.pos, | |
1269 | String::from("character literal may only contain one codepoint"))); | |
1a4d82fc | 1270 | |
9cc50fc6 | 1271 | } |
1a4d82fc | 1272 | |
9cc50fc6 SL |
1273 | // Include the leading `'` in the real identifier, for macro |
1274 | // expansion purposes. See #12512 for the gory details of why | |
1275 | // this is necessary. | |
1276 | let ident = self.with_str_from(start, |lifetime_name| { | |
1277 | str_to_ident(&format!("'{}", lifetime_name)) | |
1278 | }); | |
1279 | ||
1280 | // Conjure up a "keyword checking ident" to make sure that | |
1281 | // the lifetime name is not a keyword. | |
1282 | let keyword_checking_ident = self.with_str_from(start, |lifetime_name| { | |
1a4d82fc JJ |
1283 | str_to_ident(lifetime_name) |
1284 | }); | |
a7813a04 | 1285 | let keyword_checking_token = &token::Ident(keyword_checking_ident); |
9cc50fc6 | 1286 | let last_bpos = self.last_pos; |
a7813a04 XL |
1287 | if keyword_checking_token.is_any_keyword() && |
1288 | !keyword_checking_token.is_keyword(keywords::Static) { | |
1289 | self.err_span_(start, last_bpos, "lifetimes cannot use keyword names"); | |
9cc50fc6 SL |
1290 | } |
1291 | ||
a7813a04 | 1292 | return Ok(token::Lifetime(ident)); |
1a4d82fc | 1293 | } |
1a4d82fc | 1294 | |
9cc50fc6 SL |
1295 | let valid = self.scan_char_or_byte(start, |
1296 | c2, | |
1297 | // ascii_only = | |
1298 | false, | |
1299 | '\''); | |
92a42be0 | 1300 | |
9cc50fc6 SL |
1301 | if !self.curr_is('\'') { |
1302 | panic!(self.fatal_span_verbose( | |
1303 | start_with_quote, self.last_pos, | |
1304 | String::from("character literal may only contain one codepoint"))); | |
1a4d82fc JJ |
1305 | } |
1306 | ||
9cc50fc6 SL |
1307 | let id = if valid { |
1308 | self.name_from(start) | |
1309 | } else { | |
1310 | token::intern("0") | |
1311 | }; | |
1312 | self.bump(); // advance curr past token | |
1313 | let suffix = self.scan_optional_raw_name(); | |
a7813a04 | 1314 | return Ok(token::Literal(token::Char(id), suffix)); |
1a4d82fc | 1315 | } |
9cc50fc6 | 1316 | 'b' => { |
1a4d82fc | 1317 | self.bump(); |
9cc50fc6 SL |
1318 | let lit = match self.curr { |
1319 | Some('\'') => self.scan_byte(), | |
1320 | Some('"') => self.scan_byte_string(), | |
1321 | Some('r') => self.scan_raw_byte_string(), | |
1322 | _ => unreachable!(), // Should have been a token::Ident above. | |
1323 | }; | |
1324 | let suffix = self.scan_optional_raw_name(); | |
a7813a04 | 1325 | return Ok(token::Literal(lit, suffix)); |
1a4d82fc | 1326 | } |
9cc50fc6 SL |
1327 | '"' => { |
1328 | let start_bpos = self.last_pos; | |
1329 | let mut valid = true; | |
1330 | self.bump(); | |
1331 | while !self.curr_is('"') { | |
1332 | if self.is_eof() { | |
1333 | let last_bpos = self.last_pos; | |
1334 | panic!(self.fatal_span_(start_bpos, | |
1335 | last_bpos, | |
1336 | "unterminated double quote string")); | |
1337 | } | |
1a4d82fc | 1338 | |
9cc50fc6 SL |
1339 | let ch_start = self.last_pos; |
1340 | let ch = self.curr.unwrap(); | |
1341 | self.bump(); | |
1342 | valid &= self.scan_char_or_byte(ch_start, | |
1343 | ch, | |
1344 | // ascii_only = | |
1345 | false, | |
1346 | '"'); | |
1347 | } | |
1348 | // adjust for the ASCII " at the start of the literal | |
1349 | let id = if valid { | |
1350 | self.name_from(start_bpos + BytePos(1)) | |
1351 | } else { | |
1352 | token::intern("??") | |
1353 | }; | |
1354 | self.bump(); | |
1355 | let suffix = self.scan_optional_raw_name(); | |
a7813a04 | 1356 | return Ok(token::Literal(token::Str_(id), suffix)); |
1a4d82fc | 1357 | } |
9cc50fc6 SL |
1358 | 'r' => { |
1359 | let start_bpos = self.last_pos; | |
1360 | self.bump(); | |
1361 | let mut hash_count = 0; | |
1362 | while self.curr_is('#') { | |
1363 | self.bump(); | |
1364 | hash_count += 1; | |
1365 | } | |
1366 | ||
1a4d82fc JJ |
1367 | if self.is_eof() { |
1368 | let last_bpos = self.last_pos; | |
92a42be0 | 1369 | panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string")); |
9cc50fc6 SL |
1370 | } else if !self.curr_is('"') { |
1371 | let last_bpos = self.last_pos; | |
1372 | let curr_char = self.curr.unwrap(); | |
1373 | panic!(self.fatal_span_char(start_bpos, | |
1374 | last_bpos, | |
1375 | "found invalid character; only `#` is allowed \ | |
1376 | in raw string delimitation", | |
1377 | curr_char)); | |
1a4d82fc | 1378 | } |
9cc50fc6 SL |
1379 | self.bump(); |
1380 | let content_start_bpos = self.last_pos; | |
1381 | let mut content_end_bpos; | |
1382 | let mut valid = true; | |
1383 | 'outer: loop { | |
1384 | if self.is_eof() { | |
1385 | let last_bpos = self.last_pos; | |
1386 | panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string")); | |
1387 | } | |
1388 | // if self.curr_is('"') { | |
1389 | // content_end_bpos = self.last_pos; | |
1390 | // for _ in 0..hash_count { | |
1391 | // self.bump(); | |
1392 | // if !self.curr_is('#') { | |
1393 | // continue 'outer; | |
1394 | let c = self.curr.unwrap(); | |
1395 | match c { | |
1396 | '"' => { | |
1397 | content_end_bpos = self.last_pos; | |
1398 | for _ in 0..hash_count { | |
1399 | self.bump(); | |
1400 | if !self.curr_is('#') { | |
1401 | continue 'outer; | |
1402 | } | |
1a4d82fc | 1403 | } |
9cc50fc6 | 1404 | break; |
1a4d82fc | 1405 | } |
9cc50fc6 SL |
1406 | '\r' => { |
1407 | if !self.nextch_is('\n') { | |
1408 | let last_bpos = self.last_pos; | |
1409 | self.err_span_(start_bpos, | |
1410 | last_bpos, | |
1411 | "bare CR not allowed in raw string, use \\r \ | |
1412 | instead"); | |
1413 | valid = false; | |
1414 | } | |
1a4d82fc | 1415 | } |
9cc50fc6 | 1416 | _ => (), |
1a4d82fc | 1417 | } |
9cc50fc6 | 1418 | self.bump(); |
1a4d82fc JJ |
1419 | } |
1420 | self.bump(); | |
9cc50fc6 SL |
1421 | let id = if valid { |
1422 | self.name_from_to(content_start_bpos, content_end_bpos) | |
1423 | } else { | |
1424 | token::intern("??") | |
1425 | }; | |
1426 | let suffix = self.scan_optional_raw_name(); | |
a7813a04 | 1427 | return Ok(token::Literal(token::StrRaw(id, hash_count), suffix)); |
9cc50fc6 SL |
1428 | } |
1429 | '-' => { | |
1430 | if self.nextch_is('>') { | |
1431 | self.bump(); | |
1432 | self.bump(); | |
a7813a04 | 1433 | return Ok(token::RArrow); |
9cc50fc6 | 1434 | } else { |
a7813a04 | 1435 | return Ok(self.binop(token::Minus)); |
9cc50fc6 SL |
1436 | } |
1437 | } | |
1438 | '&' => { | |
1439 | if self.nextch_is('&') { | |
1440 | self.bump(); | |
1441 | self.bump(); | |
a7813a04 | 1442 | return Ok(token::AndAnd); |
9cc50fc6 | 1443 | } else { |
a7813a04 | 1444 | return Ok(self.binop(token::And)); |
9cc50fc6 SL |
1445 | } |
1446 | } | |
1447 | '|' => { | |
1448 | match self.nextch() { | |
1449 | Some('|') => { | |
1450 | self.bump(); | |
1451 | self.bump(); | |
a7813a04 | 1452 | return Ok(token::OrOr); |
9cc50fc6 SL |
1453 | } |
1454 | _ => { | |
a7813a04 | 1455 | return Ok(self.binop(token::Or)); |
9cc50fc6 SL |
1456 | } |
1457 | } | |
1458 | } | |
1459 | '+' => { | |
a7813a04 | 1460 | return Ok(self.binop(token::Plus)); |
9cc50fc6 SL |
1461 | } |
1462 | '*' => { | |
a7813a04 | 1463 | return Ok(self.binop(token::Star)); |
9cc50fc6 SL |
1464 | } |
1465 | '/' => { | |
a7813a04 | 1466 | return Ok(self.binop(token::Slash)); |
9cc50fc6 SL |
1467 | } |
1468 | '^' => { | |
a7813a04 | 1469 | return Ok(self.binop(token::Caret)); |
9cc50fc6 SL |
1470 | } |
1471 | '%' => { | |
a7813a04 | 1472 | return Ok(self.binop(token::Percent)); |
9cc50fc6 SL |
1473 | } |
1474 | c => { | |
1475 | let last_bpos = self.last_pos; | |
1476 | let bpos = self.pos; | |
1477 | let mut err = self.struct_fatal_span_char(last_bpos, | |
1478 | bpos, | |
1479 | "unknown start of token", | |
1480 | c); | |
1481 | unicode_chars::check_for_substitution(&self, c, &mut err); | |
a7813a04 XL |
1482 | self.fatal_errs.push(err); |
1483 | Err(()) | |
1a4d82fc | 1484 | } |
1a4d82fc JJ |
1485 | } |
1486 | } | |
1487 | ||
1488 | fn consume_whitespace(&mut self) { | |
54a0048b | 1489 | while is_pattern_whitespace(self.curr) && !self.is_eof() { |
9cc50fc6 SL |
1490 | self.bump(); |
1491 | } | |
1a4d82fc JJ |
1492 | } |
1493 | ||
1494 | fn read_to_eol(&mut self) -> String { | |
1495 | let mut val = String::new(); | |
1496 | while !self.curr_is('\n') && !self.is_eof() { | |
1497 | val.push(self.curr.unwrap()); | |
1498 | self.bump(); | |
1499 | } | |
9cc50fc6 SL |
1500 | if self.curr_is('\n') { |
1501 | self.bump(); | |
1502 | } | |
1503 | return val; | |
1a4d82fc JJ |
1504 | } |
1505 | ||
1506 | fn read_one_line_comment(&mut self) -> String { | |
1507 | let val = self.read_to_eol(); | |
9cc50fc6 SL |
1508 | assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') || |
1509 | (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!')); | |
1a4d82fc JJ |
1510 | return val; |
1511 | } | |
1512 | ||
1513 | fn consume_non_eol_whitespace(&mut self) { | |
54a0048b | 1514 | while is_pattern_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() { |
1a4d82fc JJ |
1515 | self.bump(); |
1516 | } | |
1517 | } | |
1518 | ||
1519 | fn peeking_at_comment(&self) -> bool { | |
9cc50fc6 SL |
1520 | (self.curr_is('/') && self.nextch_is('/')) || (self.curr_is('/') && self.nextch_is('*')) || |
1521 | // consider shebangs comments, but not inner attributes | |
1522 | (self.curr_is('#') && self.nextch_is('!') && !self.nextnextch_is('[')) | |
1a4d82fc JJ |
1523 | } |
1524 | ||
1525 | fn scan_byte(&mut self) -> token::Lit { | |
1526 | self.bump(); | |
1527 | let start = self.last_pos; | |
1528 | ||
1529 | // the eof will be picked up by the final `'` check below | |
1530 | let c2 = self.curr.unwrap_or('\x00'); | |
1531 | self.bump(); | |
1532 | ||
9cc50fc6 SL |
1533 | let valid = self.scan_char_or_byte(start, |
1534 | c2, | |
1535 | // ascii_only = | |
1536 | true, | |
1537 | '\''); | |
1a4d82fc JJ |
1538 | if !self.curr_is('\'') { |
1539 | // Byte offsetting here is okay because the | |
1540 | // character before position `start` are an | |
1541 | // ascii single quote and ascii 'b'. | |
1542 | let last_pos = self.last_pos; | |
9cc50fc6 SL |
1543 | panic!(self.fatal_span_verbose(start - BytePos(2), |
1544 | last_pos, | |
1545 | "unterminated byte constant".to_string())); | |
1a4d82fc JJ |
1546 | } |
1547 | ||
9cc50fc6 SL |
1548 | let id = if valid { |
1549 | self.name_from(start) | |
1550 | } else { | |
1551 | token::intern("?") | |
1552 | }; | |
1a4d82fc JJ |
1553 | self.bump(); // advance curr past token |
1554 | return token::Byte(id); | |
1555 | } | |
1556 | ||
1557 | fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool { | |
1558 | self.scan_hex_digits(2, delim, below_0x7f_only) | |
1559 | } | |
1560 | ||
1561 | fn scan_byte_string(&mut self) -> token::Lit { | |
1562 | self.bump(); | |
1563 | let start = self.last_pos; | |
1564 | let mut valid = true; | |
1565 | ||
1566 | while !self.curr_is('"') { | |
1567 | if self.is_eof() { | |
1568 | let last_pos = self.last_pos; | |
92a42be0 | 1569 | panic!(self.fatal_span_(start, last_pos, "unterminated double quote byte string")); |
1a4d82fc JJ |
1570 | } |
1571 | ||
1572 | let ch_start = self.last_pos; | |
1573 | let ch = self.curr.unwrap(); | |
1574 | self.bump(); | |
9cc50fc6 SL |
1575 | valid &= self.scan_char_or_byte(ch_start, |
1576 | ch, | |
1577 | // ascii_only = | |
1578 | true, | |
1579 | '"'); | |
1a4d82fc | 1580 | } |
9cc50fc6 SL |
1581 | let id = if valid { |
1582 | self.name_from(start) | |
1583 | } else { | |
1584 | token::intern("??") | |
1585 | }; | |
1a4d82fc | 1586 | self.bump(); |
e9174d1e | 1587 | return token::ByteStr(id); |
1a4d82fc JJ |
1588 | } |
1589 | ||
1590 | fn scan_raw_byte_string(&mut self) -> token::Lit { | |
1591 | let start_bpos = self.last_pos; | |
1592 | self.bump(); | |
85aaf69f | 1593 | let mut hash_count = 0; |
1a4d82fc JJ |
1594 | while self.curr_is('#') { |
1595 | self.bump(); | |
1596 | hash_count += 1; | |
1597 | } | |
1598 | ||
1599 | if self.is_eof() { | |
1600 | let last_pos = self.last_pos; | |
92a42be0 | 1601 | panic!(self.fatal_span_(start_bpos, last_pos, "unterminated raw string")); |
1a4d82fc JJ |
1602 | } else if !self.curr_is('"') { |
1603 | let last_pos = self.last_pos; | |
1604 | let ch = self.curr.unwrap(); | |
9cc50fc6 SL |
1605 | panic!(self.fatal_span_char(start_bpos, |
1606 | last_pos, | |
1607 | "found invalid character; only `#` is allowed in raw \ | |
1608 | string delimitation", | |
1609 | ch)); | |
1a4d82fc JJ |
1610 | } |
1611 | self.bump(); | |
1612 | let content_start_bpos = self.last_pos; | |
1613 | let mut content_end_bpos; | |
1614 | 'outer: loop { | |
1615 | match self.curr { | |
1616 | None => { | |
1617 | let last_pos = self.last_pos; | |
92a42be0 | 1618 | panic!(self.fatal_span_(start_bpos, last_pos, "unterminated raw string")) |
9cc50fc6 | 1619 | } |
1a4d82fc JJ |
1620 | Some('"') => { |
1621 | content_end_bpos = self.last_pos; | |
85aaf69f | 1622 | for _ in 0..hash_count { |
1a4d82fc JJ |
1623 | self.bump(); |
1624 | if !self.curr_is('#') { | |
1625 | continue 'outer; | |
1626 | } | |
1627 | } | |
1628 | break; | |
9cc50fc6 SL |
1629 | } |
1630 | Some(c) => { | |
1631 | if c > '\x7F' { | |
1632 | let last_pos = self.last_pos; | |
1633 | self.err_span_char(last_pos, last_pos, "raw byte string must be ASCII", c); | |
1634 | } | |
1a4d82fc JJ |
1635 | } |
1636 | } | |
1637 | self.bump(); | |
1638 | } | |
1639 | self.bump(); | |
9cc50fc6 SL |
1640 | return token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos), |
1641 | hash_count); | |
1a4d82fc JJ |
1642 | } |
1643 | } | |
1644 | ||
54a0048b SL |
1645 | // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which |
1646 | // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3 | |
1647 | pub fn is_pattern_whitespace(c: Option<char>) -> bool { | |
1648 | c.map_or(false, Pattern_White_Space) | |
1a4d82fc JJ |
1649 | } |
1650 | ||
1651 | fn in_range(c: Option<char>, lo: char, hi: char) -> bool { | |
1652 | match c { | |
1653 | Some(c) => lo <= c && c <= hi, | |
9cc50fc6 | 1654 | _ => false, |
1a4d82fc JJ |
1655 | } |
1656 | } | |
1657 | ||
9cc50fc6 SL |
1658 | fn is_dec_digit(c: Option<char>) -> bool { |
1659 | return in_range(c, '0', '9'); | |
1660 | } | |
1a4d82fc JJ |
1661 | |
1662 | pub fn is_doc_comment(s: &str) -> bool { | |
9cc50fc6 SL |
1663 | let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') || |
1664 | s.starts_with("//!"); | |
1a4d82fc JJ |
1665 | debug!("is {:?} a doc comment? {}", s, res); |
1666 | res | |
1667 | } | |
1668 | ||
1669 | pub fn is_block_doc_comment(s: &str) -> bool { | |
9cc50fc6 SL |
1670 | // Prevent `/**/` from being parsed as a doc comment |
1671 | let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') || | |
1672 | s.starts_with("/*!")) && s.len() >= 5; | |
1a4d82fc JJ |
1673 | debug!("is {:?} a doc comment? {}", s, res); |
1674 | res | |
1675 | } | |
1676 | ||
1677 | fn ident_start(c: Option<char>) -> bool { | |
9cc50fc6 SL |
1678 | let c = match c { |
1679 | Some(c) => c, | |
1680 | None => return false, | |
1681 | }; | |
1a4d82fc | 1682 | |
9cc50fc6 | 1683 | (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start()) |
1a4d82fc JJ |
1684 | } |
1685 | ||
1686 | fn ident_continue(c: Option<char>) -> bool { | |
9cc50fc6 SL |
1687 | let c = match c { |
1688 | Some(c) => c, | |
1689 | None => return false, | |
1690 | }; | |
1a4d82fc | 1691 | |
9cc50fc6 SL |
1692 | (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || |
1693 | (c > '\x7f' && c.is_xid_continue()) | |
1a4d82fc JJ |
1694 | } |
1695 | ||
#[cfg(test)]
mod tests {
    use super::*;

    use syntax_pos::{BytePos, Span, NO_EXPANSION};
    use codemap::CodeMap;
    use errors;
    use parse::token;
    use parse::token::str_to_ident;
    use std::io;
    use std::rc::Rc;

    /// Builds a diagnostics handler whose emitter writes into `io::sink()`,
    /// so nothing reported while lexing shows up in the test output.
    fn mk_sh(cm: Rc<CodeMap>) -> errors::Handler {
        // FIXME (#22405): Replace `Box::new` with `box` here when/if possible.
        let sink_emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), Some(cm));
        errors::Handler::with_emitter(true, false, Box::new(sink_emitter))
    }

    /// Creates a fresh `CodeMap` together with a handler (see `mk_sh`) that
    /// shares it.
    fn mk_env() -> (Rc<CodeMap>, errors::Handler) {
        let codemap = Rc::new(CodeMap::new());
        let handler = mk_sh(codemap.clone());
        (codemap, handler)
    }

    /// Opens a string reader over `teststr`, registered in `cm` as a file
    /// named "zebra.rs".
    fn setup<'a>(cm: &CodeMap,
                 span_handler: &'a errors::Handler,
                 teststr: String)
                 -> StringReader<'a> {
        StringReader::new(span_handler,
                          cm.new_filemap("zebra.rs".to_string(), None, teststr))
    }

    /// Lexes `src` in a fresh environment and returns only its first token.
    fn first_token(src: &str) -> token::Token {
        let (cm, sh) = mk_env();
        // Bind the result before returning so the temporary reader (which
        // borrows `sh`) is gone before `sh` itself goes out of scope.
        let tok = setup(&cm, &sh, src.to_string()).next_token().tok;
        tok
    }

    /// Makes the identifier token by looking up the string in the interner.
    fn mk_ident(id: &str) -> token::Token {
        token::Ident(str_to_ident(id))
    }

    /// The identifier token `name` paired with the unexpanded span `lo..hi`.
    fn ident_spanning(name: &str, lo: BytePos, hi: BytePos) -> TokenAndSpan {
        TokenAndSpan {
            tok: mk_ident(name),
            sp: Span {
                lo: lo,
                hi: hi,
                expn_id: NO_EXPANSION,
            },
        }
    }

    /// Checks that the given reader produces the desired stream of tokens
    /// (checking stops once the expected vec is exhausted).
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for wanted in expected.iter() {
            let actual = string_reader.next_token().tok;
            assert_eq!(&actual, wanted);
        }
    }

    #[test]
    fn t1() {
        let (cm, sh) = mk_env();
        let source = "/* my source file */ fn main() { println!(\"zebra\"); }\n".to_string();
        let mut reader = setup(&cm, &sh, source);

        assert_eq!(reader.next_token().tok, token::Comment);
        assert_eq!(reader.next_token().tok, token::Whitespace);

        // `fn` sits at bytes 21..23, right after the block comment and a space
        let fn_tok = reader.next_token();
        assert_eq!(fn_tok, ident_spanning("fn", BytePos(21), BytePos(23)));
        assert_eq!(reader.next_token().tok, token::Whitespace);

        // the 'main' id is already read:
        assert_eq!(reader.last_pos, BytePos(28));
        // read another token:
        let main_tok = reader.next_token();
        assert_eq!(main_tok, ident_spanning("main", BytePos(24), BytePos(28)));

        // the lparen is already read:
        assert_eq!(reader.last_pos, BytePos(29));
    }

    // two identifiers separated by plain whitespace
    #[test]
    fn doublecolonparsing() {
        let (cm, sh) = mk_env();
        let expected = vec![mk_ident("a"), token::Whitespace, mk_ident("b")];
        check_tokenization(setup(&cm, &sh, "a b".to_string()), expected);
    }

    // `::` directly between two identifiers
    #[test]
    fn dcparsing_2() {
        let (cm, sh) = mk_env();
        let expected = vec![mk_ident("a"), token::ModSep, mk_ident("b")];
        check_tokenization(setup(&cm, &sh, "a::b".to_string()), expected);
    }

    // whitespace before the `::`
    #[test]
    fn dcparsing_3() {
        let (cm, sh) = mk_env();
        let expected = vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")];
        check_tokenization(setup(&cm, &sh, "a ::b".to_string()), expected);
    }

    // whitespace after the `::`
    #[test]
    fn dcparsing_4() {
        let (cm, sh) = mk_env();
        let expected = vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")];
        check_tokenization(setup(&cm, &sh, "a:: b".to_string()), expected);
    }

    #[test]
    fn character_a() {
        assert_eq!(first_token("'a'"),
                   token::Literal(token::Char(token::intern("a")), None));
    }

    #[test]
    fn character_space() {
        assert_eq!(first_token("' '"),
                   token::Literal(token::Char(token::intern(" ")), None));
    }

    // the escape is kept verbatim (backslash + `n`) in the interned contents
    #[test]
    fn character_escaped() {
        assert_eq!(first_token("'\\n'"),
                   token::Literal(token::Char(token::intern("\\n")), None));
    }

    #[test]
    fn lifetime_name() {
        assert_eq!(first_token("'abc"),
                   token::Lifetime(token::str_to_ident("'abc")));
    }

    // three-hash raw string whose body contains quotes, a hash, a backslash
    // and a NUL byte
    #[test]
    fn raw_string() {
        assert_eq!(first_token("r###\"\"#a\\b\x00c\"\"###"),
                   token::Literal(token::StrRaw(token::intern("\"#a\\b\x00c\""), 3), None));
    }

    #[test]
    fn literal_suffixes() {
        let (cm, sh) = mk_env();

        // Lexes `$input` once with "suffix" glued on and once with a space in
        // between; only the glued form may carry the suffix.
        macro_rules! check_suffix {
            ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
                let glued = setup(&cm, &sh, format!("{}suffix", $input)).next_token().tok;
                assert_eq!(glued,
                           token::Literal(token::$tok_type(token::intern($tok_contents)),
                                          Some(token::intern("suffix"))));
                // with a whitespace separator:
                let spaced = setup(&cm, &sh, format!("{} suffix", $input)).next_token().tok;
                assert_eq!(spaced,
                           token::Literal(token::$tok_type(token::intern($tok_contents)),
                                          None));
            }}
        }

        check_suffix!("'a'", Char, "a");
        check_suffix!("b'a'", Byte, "a");
        check_suffix!("\"a\"", Str_, "a");
        check_suffix!("b\"a\"", ByteStr, "a");
        check_suffix!("1234", Integer, "1234");
        check_suffix!("0b101", Integer, "0b101");
        check_suffix!("0xABC", Integer, "0xABC");
        check_suffix!("1.0", Float, "1.0");
        check_suffix!("1.0e10", Float, "1.0e10");

        let short_suffix = setup(&cm, &sh, "2us".to_string()).next_token().tok;
        assert_eq!(short_suffix,
                   token::Literal(token::Integer(token::intern("2")),
                                  Some(token::intern("us"))));

        let raw_str = setup(&cm, &sh, "r###\"raw\"###suffix".to_string()).next_token().tok;
        assert_eq!(raw_str,
                   token::Literal(token::StrRaw(token::intern("raw"), 3),
                                  Some(token::intern("suffix"))));

        let raw_bytes = setup(&cm, &sh, "br###\"raw\"###suffix".to_string()).next_token().tok;
        assert_eq!(raw_bytes,
                   token::Literal(token::ByteStrRaw(token::intern("raw"), 3),
                                  Some(token::intern("suffix"))));
    }

    // exactly three slashes start a doc comment; four do not
    #[test]
    fn line_doc_comments() {
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        assert!(!is_doc_comment("////"));
    }

    // the nested comment must come out as one token, leaving `'a'` as the next
    #[test]
    fn nested_block_comments() {
        let (cm, sh) = mk_env();
        let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());

        let first = lexer.next_token().tok;
        assert!(first == token::Comment, "expected a comment!");

        let second = lexer.next_token().tok;
        assert_eq!(second, token::Literal(token::Char(token::intern("a")), None));
    }

    #[test]
    fn crlf_comments() {
        let (cm, sh) = mk_env();
        let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());

        // the line comment spans bytes 0..7, i.e. it stops before the `\r\n`
        let line_comment = lexer.next_token();
        assert_eq!(line_comment.tok, token::Comment);
        assert_eq!(line_comment.sp, ::syntax_pos::mk_sp(BytePos(0), BytePos(7)));

        assert_eq!(lexer.next_token().tok, token::Whitespace);

        let doc = lexer.next_token().tok;
        assert_eq!(doc, token::DocComment(token::intern("/// test")));
    }
}