]>
Commit | Line | Data |
---|---|---|
1a4d82fc JJ |
1 | // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | use ast; | |
12 | use codemap::{BytePos, CharPos, CodeMap, Pos, Span}; | |
13 | use codemap; | |
9cc50fc6 | 14 | use errors::{FatalError, Handler, DiagnosticBuilder}; |
1a4d82fc | 15 | use ext::tt::transcribe::tt_next_token; |
c34b1796 | 16 | use parse::token::str_to_ident; |
d9579d0f AL |
17 | use parse::token; |
18 | use str::char_at; | |
54a0048b | 19 | use rustc_unicode::property::Pattern_White_Space; |
1a4d82fc | 20 | |
d9579d0f | 21 | use std::borrow::Cow; |
1a4d82fc | 22 | use std::char; |
1a4d82fc | 23 | use std::mem::replace; |
1a4d82fc | 24 | use std::rc::Rc; |
1a4d82fc JJ |
25 | |
26 | pub use ext::tt::transcribe::{TtReader, new_tt_reader, new_tt_reader_with_doc_flag}; | |
27 | ||
28 | pub mod comments; | |
92a42be0 | 29 | mod unicode_chars; |
1a4d82fc JJ |
30 | |
31 | pub trait Reader { | |
32 | fn is_eof(&self) -> bool; | |
33 | fn next_token(&mut self) -> TokenAndSpan; | |
34 | /// Report a fatal error with the current span. | |
92a42be0 | 35 | fn fatal(&self, &str) -> FatalError; |
1a4d82fc JJ |
36 | /// Report a non-fatal error with the current span. |
37 | fn err(&self, &str); | |
38 | fn peek(&self) -> TokenAndSpan; | |
39 | /// Get a token the parser cares about. | |
40 | fn real_token(&mut self) -> TokenAndSpan { | |
41 | let mut t = self.next_token(); | |
42 | loop { | |
43 | match t.tok { | |
44 | token::Whitespace | token::Comment | token::Shebang(_) => { | |
45 | t = self.next_token(); | |
9cc50fc6 SL |
46 | } |
47 | _ => break, | |
1a4d82fc JJ |
48 | } |
49 | } | |
50 | t | |
51 | } | |
52 | } | |
53 | ||
85aaf69f | 54 | #[derive(Clone, PartialEq, Eq, Debug)] |
1a4d82fc JJ |
55 | pub struct TokenAndSpan { |
56 | pub tok: token::Token, | |
57 | pub sp: Span, | |
58 | } | |
59 | ||
60 | pub struct StringReader<'a> { | |
9cc50fc6 | 61 | pub span_diagnostic: &'a Handler, |
1a4d82fc JJ |
62 | /// The absolute offset within the codemap of the next character to read |
63 | pub pos: BytePos, | |
64 | /// The absolute offset within the codemap of the last character read(curr) | |
65 | pub last_pos: BytePos, | |
66 | /// The column of the next character to read | |
67 | pub col: CharPos, | |
68 | /// The last character to be read | |
69 | pub curr: Option<char>, | |
70 | pub filemap: Rc<codemap::FileMap>, | |
9cc50fc6 | 71 | // cached: |
1a4d82fc JJ |
72 | pub peek_tok: token::Token, |
73 | pub peek_span: Span, | |
74 | ||
c34b1796 AL |
75 | // cache a direct reference to the source text, so that we don't have to |
76 | // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time. | |
9cc50fc6 | 77 | source_text: Rc<String>, |
1a4d82fc JJ |
78 | } |
79 | ||
80 | impl<'a> Reader for StringReader<'a> { | |
9cc50fc6 SL |
81 | fn is_eof(&self) -> bool { |
82 | self.curr.is_none() | |
83 | } | |
1a4d82fc JJ |
84 | /// Return the next token. EFFECT: advances the string_reader. |
85 | fn next_token(&mut self) -> TokenAndSpan { | |
86 | let ret_val = TokenAndSpan { | |
87 | tok: replace(&mut self.peek_tok, token::Underscore), | |
88 | sp: self.peek_span, | |
89 | }; | |
90 | self.advance_token(); | |
91 | ret_val | |
92 | } | |
92a42be0 | 93 | fn fatal(&self, m: &str) -> FatalError { |
1a4d82fc JJ |
94 | self.fatal_span(self.peek_span, m) |
95 | } | |
96 | fn err(&self, m: &str) { | |
97 | self.err_span(self.peek_span, m) | |
98 | } | |
99 | fn peek(&self) -> TokenAndSpan { | |
100 | // FIXME(pcwalton): Bad copy! | |
101 | TokenAndSpan { | |
102 | tok: self.peek_tok.clone(), | |
103 | sp: self.peek_span, | |
104 | } | |
105 | } | |
106 | } | |
107 | ||
108 | impl<'a> Reader for TtReader<'a> { | |
109 | fn is_eof(&self) -> bool { | |
110 | self.cur_tok == token::Eof | |
111 | } | |
112 | fn next_token(&mut self) -> TokenAndSpan { | |
113 | let r = tt_next_token(self); | |
114 | debug!("TtReader: r={:?}", r); | |
115 | r | |
116 | } | |
92a42be0 SL |
117 | fn fatal(&self, m: &str) -> FatalError { |
118 | self.sp_diag.span_fatal(self.cur_span, m) | |
1a4d82fc JJ |
119 | } |
120 | fn err(&self, m: &str) { | |
121 | self.sp_diag.span_err(self.cur_span, m); | |
122 | } | |
123 | fn peek(&self) -> TokenAndSpan { | |
124 | TokenAndSpan { | |
125 | tok: self.cur_tok.clone(), | |
126 | sp: self.cur_span, | |
127 | } | |
128 | } | |
129 | } | |
130 | ||
1a4d82fc JJ |
131 | impl<'a> StringReader<'a> { |
132 | /// For comments.rs, which hackily pokes into pos and curr | |
9cc50fc6 SL |
133 | pub fn new_raw<'b>(span_diagnostic: &'b Handler, |
134 | filemap: Rc<codemap::FileMap>) | |
135 | -> StringReader<'b> { | |
c34b1796 | 136 | if filemap.src.is_none() { |
9cc50fc6 SL |
137 | span_diagnostic.bug(&format!("Cannot lex filemap \ |
138 | without source: {}", | |
139 | filemap.name)[..]); | |
c34b1796 AL |
140 | } |
141 | ||
142 | let source_text = (*filemap.src.as_ref().unwrap()).clone(); | |
143 | ||
1a4d82fc JJ |
144 | let mut sr = StringReader { |
145 | span_diagnostic: span_diagnostic, | |
146 | pos: filemap.start_pos, | |
147 | last_pos: filemap.start_pos, | |
148 | col: CharPos(0), | |
149 | curr: Some('\n'), | |
150 | filemap: filemap, | |
9cc50fc6 | 151 | // dummy values; not read |
1a4d82fc JJ |
152 | peek_tok: token::Eof, |
153 | peek_span: codemap::DUMMY_SP, | |
9cc50fc6 | 154 | source_text: source_text, |
1a4d82fc JJ |
155 | }; |
156 | sr.bump(); | |
157 | sr | |
158 | } | |
159 | ||
9cc50fc6 SL |
160 | pub fn new<'b>(span_diagnostic: &'b Handler, |
161 | filemap: Rc<codemap::FileMap>) | |
162 | -> StringReader<'b> { | |
1a4d82fc JJ |
163 | let mut sr = StringReader::new_raw(span_diagnostic, filemap); |
164 | sr.advance_token(); | |
165 | sr | |
166 | } | |
167 | ||
168 | pub fn curr_is(&self, c: char) -> bool { | |
169 | self.curr == Some(c) | |
170 | } | |
171 | ||
172 | /// Report a fatal lexical error with a given span. | |
92a42be0 SL |
173 | pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError { |
174 | self.span_diagnostic.span_fatal(sp, m) | |
1a4d82fc JJ |
175 | } |
176 | ||
177 | /// Report a lexical error with a given span. | |
178 | pub fn err_span(&self, sp: Span, m: &str) { | |
179 | self.span_diagnostic.span_err(sp, m) | |
180 | } | |
181 | ||
c1a9b12d | 182 | |
1a4d82fc | 183 | /// Report a fatal error spanning [`from_pos`, `to_pos`). |
92a42be0 | 184 | fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError { |
1a4d82fc JJ |
185 | self.fatal_span(codemap::mk_sp(from_pos, to_pos), m) |
186 | } | |
187 | ||
188 | /// Report a lexical error spanning [`from_pos`, `to_pos`). | |
189 | fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) { | |
190 | self.err_span(codemap::mk_sp(from_pos, to_pos), m) | |
191 | } | |
192 | ||
193 | /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an | |
194 | /// escaped character to the error message | |
92a42be0 | 195 | fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError { |
1a4d82fc JJ |
196 | let mut m = m.to_string(); |
197 | m.push_str(": "); | |
9cc50fc6 SL |
198 | for c in c.escape_default() { |
199 | m.push(c) | |
200 | } | |
92a42be0 | 201 | self.fatal_span_(from_pos, to_pos, &m[..]) |
1a4d82fc | 202 | } |
9cc50fc6 SL |
203 | fn struct_fatal_span_char(&self, |
204 | from_pos: BytePos, | |
205 | to_pos: BytePos, | |
206 | m: &str, | |
207 | c: char) | |
208 | -> DiagnosticBuilder<'a> { | |
209 | let mut m = m.to_string(); | |
210 | m.push_str(": "); | |
211 | for c in c.escape_default() { | |
212 | m.push(c) | |
213 | } | |
214 | self.span_diagnostic.struct_span_fatal(codemap::mk_sp(from_pos, to_pos), &m[..]) | |
215 | } | |
1a4d82fc JJ |
216 | |
217 | /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an | |
218 | /// escaped character to the error message | |
219 | fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) { | |
220 | let mut m = m.to_string(); | |
221 | m.push_str(": "); | |
9cc50fc6 SL |
222 | for c in c.escape_default() { |
223 | m.push(c) | |
224 | } | |
85aaf69f | 225 | self.err_span_(from_pos, to_pos, &m[..]); |
1a4d82fc | 226 | } |
9cc50fc6 SL |
227 | fn struct_err_span_char(&self, |
228 | from_pos: BytePos, | |
229 | to_pos: BytePos, | |
230 | m: &str, | |
231 | c: char) | |
232 | -> DiagnosticBuilder<'a> { | |
233 | let mut m = m.to_string(); | |
234 | m.push_str(": "); | |
235 | for c in c.escape_default() { | |
236 | m.push(c) | |
237 | } | |
238 | self.span_diagnostic.struct_span_err(codemap::mk_sp(from_pos, to_pos), &m[..]) | |
239 | } | |
1a4d82fc JJ |
240 | |
241 | /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the | |
242 | /// offending string to the error message | |
92a42be0 | 243 | fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError { |
1a4d82fc | 244 | m.push_str(": "); |
85aaf69f SL |
245 | let from = self.byte_offset(from_pos).to_usize(); |
246 | let to = self.byte_offset(to_pos).to_usize(); | |
c34b1796 | 247 | m.push_str(&self.source_text[from..to]); |
92a42be0 | 248 | self.fatal_span_(from_pos, to_pos, &m[..]) |
1a4d82fc JJ |
249 | } |
250 | ||
251 | /// Advance peek_tok and peek_span to refer to the next token, and | |
252 | /// possibly update the interner. | |
253 | fn advance_token(&mut self) { | |
254 | match self.scan_whitespace_or_comment() { | |
255 | Some(comment) => { | |
256 | self.peek_span = comment.sp; | |
257 | self.peek_tok = comment.tok; | |
9cc50fc6 | 258 | } |
1a4d82fc JJ |
259 | None => { |
260 | if self.is_eof() { | |
261 | self.peek_tok = token::Eof; | |
c1a9b12d | 262 | self.peek_span = codemap::mk_sp(self.filemap.end_pos, self.filemap.end_pos); |
1a4d82fc JJ |
263 | } else { |
264 | let start_bytepos = self.last_pos; | |
265 | self.peek_tok = self.next_token_inner(); | |
9cc50fc6 | 266 | self.peek_span = codemap::mk_sp(start_bytepos, self.last_pos); |
1a4d82fc JJ |
267 | }; |
268 | } | |
269 | } | |
270 | } | |
271 | ||
272 | fn byte_offset(&self, pos: BytePos) -> BytePos { | |
273 | (pos - self.filemap.start_pos) | |
274 | } | |
275 | ||
276 | /// Calls `f` with a string slice of the source text spanning from `start` | |
277 | /// up to but excluding `self.last_pos`, meaning the slice does not include | |
278 | /// the character `self.curr`. | |
9cc50fc6 SL |
279 | pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T |
280 | where F: FnOnce(&str) -> T | |
1a4d82fc JJ |
281 | { |
282 | self.with_str_from_to(start, self.last_pos, f) | |
283 | } | |
284 | ||
285 | /// Create a Name from a given offset to the current offset, each | |
286 | /// adjusted 1 towards each other (assumes that on either side there is a | |
287 | /// single-byte delimiter). | |
288 | pub fn name_from(&self, start: BytePos) -> ast::Name { | |
289 | debug!("taking an ident from {:?} to {:?}", start, self.last_pos); | |
290 | self.with_str_from(start, token::intern) | |
291 | } | |
292 | ||
293 | /// As name_from, with an explicit endpoint. | |
294 | pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name { | |
295 | debug!("taking an ident from {:?} to {:?}", start, end); | |
296 | self.with_str_from_to(start, end, token::intern) | |
297 | } | |
298 | ||
299 | /// Calls `f` with a string slice of the source text spanning from `start` | |
300 | /// up to but excluding `end`. | |
9cc50fc6 SL |
301 | fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T |
302 | where F: FnOnce(&str) -> T | |
1a4d82fc | 303 | { |
9cc50fc6 | 304 | f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()]) |
1a4d82fc JJ |
305 | } |
306 | ||
307 | /// Converts CRLF to LF in the given string, raising an error on bare CR. | |
9cc50fc6 | 308 | fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> { |
85aaf69f | 309 | let mut i = 0; |
1a4d82fc | 310 | while i < s.len() { |
d9579d0f | 311 | let ch = char_at(s, i); |
c34b1796 | 312 | let next = i + ch.len_utf8(); |
1a4d82fc | 313 | if ch == '\r' { |
d9579d0f AL |
314 | if next < s.len() && char_at(s, next) == '\n' { |
315 | return translate_crlf_(self, start, s, errmsg, i).into(); | |
1a4d82fc JJ |
316 | } |
317 | let pos = start + BytePos(i as u32); | |
318 | let end_pos = start + BytePos(next as u32); | |
319 | self.err_span_(pos, end_pos, errmsg); | |
320 | } | |
321 | i = next; | |
322 | } | |
d9579d0f | 323 | return s.into(); |
1a4d82fc | 324 | |
9cc50fc6 SL |
325 | fn translate_crlf_(rdr: &StringReader, |
326 | start: BytePos, | |
327 | s: &str, | |
328 | errmsg: &str, | |
329 | mut i: usize) | |
330 | -> String { | |
1a4d82fc JJ |
331 | let mut buf = String::with_capacity(s.len()); |
332 | let mut j = 0; | |
333 | while i < s.len() { | |
d9579d0f | 334 | let ch = char_at(s, i); |
c34b1796 | 335 | let next = i + ch.len_utf8(); |
1a4d82fc | 336 | if ch == '\r' { |
9cc50fc6 SL |
337 | if j < i { |
338 | buf.push_str(&s[j..i]); | |
339 | } | |
1a4d82fc | 340 | j = next; |
d9579d0f | 341 | if next >= s.len() || char_at(s, next) != '\n' { |
1a4d82fc JJ |
342 | let pos = start + BytePos(i as u32); |
343 | let end_pos = start + BytePos(next as u32); | |
344 | rdr.err_span_(pos, end_pos, errmsg); | |
345 | } | |
346 | } | |
347 | i = next; | |
348 | } | |
9cc50fc6 SL |
349 | if j < s.len() { |
350 | buf.push_str(&s[j..]); | |
351 | } | |
1a4d82fc JJ |
352 | buf |
353 | } | |
354 | } | |
355 | ||
356 | ||
357 | /// Advance the StringReader by one character. If a newline is | |
358 | /// discovered, add it to the FileMap's list of line start offsets. | |
359 | pub fn bump(&mut self) { | |
360 | self.last_pos = self.pos; | |
85aaf69f | 361 | let current_byte_offset = self.byte_offset(self.pos).to_usize(); |
c34b1796 | 362 | if current_byte_offset < self.source_text.len() { |
1a4d82fc JJ |
363 | assert!(self.curr.is_some()); |
364 | let last_char = self.curr.unwrap(); | |
d9579d0f | 365 | let ch = char_at(&self.source_text, current_byte_offset); |
c34b1796 AL |
366 | let next = current_byte_offset + ch.len_utf8(); |
367 | let byte_offset_diff = next - current_byte_offset; | |
85aaf69f | 368 | self.pos = self.pos + Pos::from_usize(byte_offset_diff); |
c34b1796 | 369 | self.curr = Some(ch); |
85aaf69f | 370 | self.col = self.col + CharPos(1); |
1a4d82fc JJ |
371 | if last_char == '\n' { |
372 | self.filemap.next_line(self.last_pos); | |
85aaf69f | 373 | self.col = CharPos(0); |
1a4d82fc JJ |
374 | } |
375 | ||
376 | if byte_offset_diff > 1 { | |
377 | self.filemap.record_multibyte_char(self.last_pos, byte_offset_diff); | |
378 | } | |
379 | } else { | |
380 | self.curr = None; | |
381 | } | |
382 | } | |
383 | ||
384 | pub fn nextch(&self) -> Option<char> { | |
85aaf69f | 385 | let offset = self.byte_offset(self.pos).to_usize(); |
c34b1796 | 386 | if offset < self.source_text.len() { |
d9579d0f | 387 | Some(char_at(&self.source_text, offset)) |
1a4d82fc JJ |
388 | } else { |
389 | None | |
390 | } | |
391 | } | |
392 | ||
393 | pub fn nextch_is(&self, c: char) -> bool { | |
394 | self.nextch() == Some(c) | |
395 | } | |
396 | ||
397 | pub fn nextnextch(&self) -> Option<char> { | |
85aaf69f | 398 | let offset = self.byte_offset(self.pos).to_usize(); |
c34b1796 | 399 | let s = &self.source_text[..]; |
9cc50fc6 SL |
400 | if offset >= s.len() { |
401 | return None; | |
402 | } | |
d9579d0f | 403 | let next = offset + char_at(s, offset).len_utf8(); |
1a4d82fc | 404 | if next < s.len() { |
d9579d0f | 405 | Some(char_at(s, next)) |
1a4d82fc JJ |
406 | } else { |
407 | None | |
408 | } | |
409 | } | |
410 | ||
411 | pub fn nextnextch_is(&self, c: char) -> bool { | |
412 | self.nextnextch() == Some(c) | |
413 | } | |
414 | ||
415 | /// Eats <XID_start><XID_continue>*, if possible. | |
416 | fn scan_optional_raw_name(&mut self) -> Option<ast::Name> { | |
417 | if !ident_start(self.curr) { | |
9cc50fc6 | 418 | return None; |
1a4d82fc JJ |
419 | } |
420 | let start = self.last_pos; | |
421 | while ident_continue(self.curr) { | |
422 | self.bump(); | |
423 | } | |
424 | ||
425 | self.with_str_from(start, |string| { | |
426 | if string == "_" { | |
427 | None | |
428 | } else { | |
429 | Some(token::intern(string)) | |
430 | } | |
431 | }) | |
432 | } | |
433 | ||
434 | /// PRECONDITION: self.curr is not whitespace | |
435 | /// Eats any kind of comment. | |
436 | fn scan_comment(&mut self) -> Option<TokenAndSpan> { | |
437 | match self.curr { | |
438 | Some(c) => { | |
439 | if c.is_whitespace() { | |
440 | self.span_diagnostic.span_err(codemap::mk_sp(self.last_pos, self.last_pos), | |
9cc50fc6 SL |
441 | "called consume_any_line_comment, but there \ |
442 | was whitespace"); | |
1a4d82fc | 443 | } |
9cc50fc6 SL |
444 | } |
445 | None => {} | |
1a4d82fc JJ |
446 | } |
447 | ||
448 | if self.curr_is('/') { | |
449 | match self.nextch() { | |
450 | Some('/') => { | |
451 | self.bump(); | |
452 | self.bump(); | |
62682a34 | 453 | |
1a4d82fc | 454 | // line comments starting with "///" or "//!" are doc-comments |
62682a34 SL |
455 | let doc_comment = self.curr_is('/') || self.curr_is('!'); |
456 | let start_bpos = if doc_comment { | |
457 | self.pos - BytePos(3) | |
458 | } else { | |
459 | self.last_pos - BytePos(2) | |
460 | }; | |
461 | ||
462 | while !self.is_eof() { | |
463 | match self.curr.unwrap() { | |
464 | '\n' => break, | |
465 | '\r' => { | |
466 | if self.nextch_is('\n') { | |
467 | // CRLF | |
9cc50fc6 | 468 | break; |
62682a34 | 469 | } else if doc_comment { |
9cc50fc6 SL |
470 | self.err_span_(self.last_pos, |
471 | self.pos, | |
62682a34 | 472 | "bare CR not allowed in doc-comment"); |
1a4d82fc | 473 | } |
1a4d82fc | 474 | } |
9cc50fc6 | 475 | _ => (), |
1a4d82fc | 476 | } |
62682a34 SL |
477 | self.bump(); |
478 | } | |
479 | ||
480 | return if doc_comment { | |
481 | self.with_str_from(start_bpos, |string| { | |
482 | // comments with only more "/"s are not doc comments | |
1a4d82fc JJ |
483 | let tok = if is_doc_comment(string) { |
484 | token::DocComment(token::intern(string)) | |
485 | } else { | |
486 | token::Comment | |
487 | }; | |
488 | ||
62682a34 | 489 | Some(TokenAndSpan { |
1a4d82fc | 490 | tok: tok, |
9cc50fc6 | 491 | sp: codemap::mk_sp(start_bpos, self.last_pos), |
62682a34 SL |
492 | }) |
493 | }) | |
1a4d82fc | 494 | } else { |
62682a34 | 495 | Some(TokenAndSpan { |
1a4d82fc | 496 | tok: token::Comment, |
9cc50fc6 | 497 | sp: codemap::mk_sp(start_bpos, self.last_pos), |
62682a34 | 498 | }) |
9cc50fc6 | 499 | }; |
1a4d82fc JJ |
500 | } |
501 | Some('*') => { | |
9cc50fc6 SL |
502 | self.bump(); |
503 | self.bump(); | |
1a4d82fc JJ |
504 | self.scan_block_comment() |
505 | } | |
9cc50fc6 | 506 | _ => None, |
1a4d82fc JJ |
507 | } |
508 | } else if self.curr_is('#') { | |
509 | if self.nextch_is('!') { | |
510 | ||
511 | // Parse an inner attribute. | |
512 | if self.nextnextch_is('[') { | |
513 | return None; | |
514 | } | |
515 | ||
516 | // I guess this is the only way to figure out if | |
517 | // we're at the beginning of the file... | |
518 | let cmap = CodeMap::new(); | |
519 | cmap.files.borrow_mut().push(self.filemap.clone()); | |
520 | let loc = cmap.lookup_char_pos_adj(self.last_pos); | |
521 | debug!("Skipping a shebang"); | |
85aaf69f | 522 | if loc.line == 1 && loc.col == CharPos(0) { |
1a4d82fc JJ |
523 | // FIXME: Add shebang "token", return it |
524 | let start = self.last_pos; | |
9cc50fc6 SL |
525 | while !self.curr_is('\n') && !self.is_eof() { |
526 | self.bump(); | |
527 | } | |
1a4d82fc JJ |
528 | return Some(TokenAndSpan { |
529 | tok: token::Shebang(self.name_from(start)), | |
9cc50fc6 | 530 | sp: codemap::mk_sp(start, self.last_pos), |
1a4d82fc JJ |
531 | }); |
532 | } | |
533 | } | |
534 | None | |
535 | } else { | |
536 | None | |
537 | } | |
538 | } | |
539 | ||
540 | /// If there is whitespace, shebang, or a comment, scan it. Otherwise, | |
541 | /// return None. | |
542 | fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> { | |
543 | match self.curr.unwrap_or('\0') { | |
544 | // # to handle shebang at start of file -- this is the entry point | |
545 | // for skipping over all "junk" | |
546 | '/' | '#' => { | |
547 | let c = self.scan_comment(); | |
548 | debug!("scanning a comment {:?}", c); | |
549 | c | |
54a0048b SL |
550 | }, |
551 | c if is_pattern_whitespace(Some(c)) => { | |
1a4d82fc | 552 | let start_bpos = self.last_pos; |
54a0048b | 553 | while is_pattern_whitespace(self.curr) { |
9cc50fc6 SL |
554 | self.bump(); |
555 | } | |
1a4d82fc JJ |
556 | let c = Some(TokenAndSpan { |
557 | tok: token::Whitespace, | |
9cc50fc6 | 558 | sp: codemap::mk_sp(start_bpos, self.last_pos), |
1a4d82fc JJ |
559 | }); |
560 | debug!("scanning whitespace: {:?}", c); | |
561 | c | |
9cc50fc6 SL |
562 | } |
563 | _ => None, | |
1a4d82fc JJ |
564 | } |
565 | } | |
566 | ||
567 | /// Might return a sugared-doc-attr | |
568 | fn scan_block_comment(&mut self) -> Option<TokenAndSpan> { | |
569 | // block comments starting with "/**" or "/*!" are doc-comments | |
570 | let is_doc_comment = self.curr_is('*') || self.curr_is('!'); | |
571 | let start_bpos = self.last_pos - BytePos(2); | |
572 | ||
85aaf69f | 573 | let mut level: isize = 1; |
1a4d82fc JJ |
574 | let mut has_cr = false; |
575 | while level > 0 { | |
576 | if self.is_eof() { | |
577 | let msg = if is_doc_comment { | |
578 | "unterminated block doc-comment" | |
579 | } else { | |
580 | "unterminated block comment" | |
581 | }; | |
582 | let last_bpos = self.last_pos; | |
92a42be0 | 583 | panic!(self.fatal_span_(start_bpos, last_bpos, msg)); |
1a4d82fc JJ |
584 | } |
585 | let n = self.curr.unwrap(); | |
586 | match n { | |
587 | '/' if self.nextch_is('*') => { | |
588 | level += 1; | |
589 | self.bump(); | |
590 | } | |
591 | '*' if self.nextch_is('/') => { | |
592 | level -= 1; | |
593 | self.bump(); | |
594 | } | |
595 | '\r' => { | |
596 | has_cr = true; | |
597 | } | |
9cc50fc6 | 598 | _ => (), |
1a4d82fc JJ |
599 | } |
600 | self.bump(); | |
601 | } | |
602 | ||
603 | self.with_str_from(start_bpos, |string| { | |
604 | // but comments with only "*"s between two "/"s are not | |
605 | let tok = if is_block_doc_comment(string) { | |
606 | let string = if has_cr { | |
9cc50fc6 SL |
607 | self.translate_crlf(start_bpos, |
608 | string, | |
1a4d82fc | 609 | "bare CR not allowed in block doc-comment") |
9cc50fc6 SL |
610 | } else { |
611 | string.into() | |
612 | }; | |
85aaf69f | 613 | token::DocComment(token::intern(&string[..])) |
1a4d82fc JJ |
614 | } else { |
615 | token::Comment | |
616 | }; | |
617 | ||
9cc50fc6 | 618 | Some(TokenAndSpan { |
1a4d82fc | 619 | tok: tok, |
9cc50fc6 | 620 | sp: codemap::mk_sp(start_bpos, self.last_pos), |
1a4d82fc JJ |
621 | }) |
622 | }) | |
623 | } | |
624 | ||
c34b1796 AL |
625 | /// Scan through any digits (base `scan_radix`) or underscores, |
626 | /// and return how many digits there were. | |
627 | /// | |
628 | /// `real_radix` represents the true radix of the number we're | |
629 | /// interested in, and errors will be emitted for any digits | |
630 | /// between `real_radix` and `scan_radix`. | |
631 | fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize { | |
632 | assert!(real_radix <= scan_radix); | |
85aaf69f | 633 | let mut len = 0; |
1a4d82fc JJ |
634 | loop { |
635 | let c = self.curr; | |
9cc50fc6 SL |
636 | if c == Some('_') { |
637 | debug!("skipping a _"); | |
638 | self.bump(); | |
639 | continue; | |
640 | } | |
c34b1796 | 641 | match c.and_then(|cc| cc.to_digit(scan_radix)) { |
1a4d82fc JJ |
642 | Some(_) => { |
643 | debug!("{:?} in scan_digits", c); | |
c34b1796 AL |
644 | // check that the hypothetical digit is actually |
645 | // in range for the true radix | |
646 | if c.unwrap().to_digit(real_radix).is_none() { | |
9cc50fc6 SL |
647 | self.err_span_(self.last_pos, |
648 | self.pos, | |
649 | &format!("invalid digit for a base {} literal", real_radix)); | |
c34b1796 | 650 | } |
1a4d82fc JJ |
651 | len += 1; |
652 | self.bump(); | |
653 | } | |
9cc50fc6 | 654 | _ => return len, |
1a4d82fc | 655 | } |
9cc50fc6 | 656 | } |
1a4d82fc JJ |
657 | } |
658 | ||
659 | /// Lex a LIT_INTEGER or a LIT_FLOAT | |
660 | fn scan_number(&mut self, c: char) -> token::Lit { | |
c1a9b12d | 661 | let num_digits; |
1a4d82fc JJ |
662 | let mut base = 10; |
663 | let start_bpos = self.last_pos; | |
664 | ||
665 | self.bump(); | |
666 | ||
667 | if c == '0' { | |
668 | match self.curr.unwrap_or('\0') { | |
9cc50fc6 SL |
669 | 'b' => { |
670 | self.bump(); | |
671 | base = 2; | |
672 | num_digits = self.scan_digits(2, 10); | |
673 | } | |
674 | 'o' => { | |
675 | self.bump(); | |
676 | base = 8; | |
677 | num_digits = self.scan_digits(8, 10); | |
678 | } | |
679 | 'x' => { | |
680 | self.bump(); | |
681 | base = 16; | |
682 | num_digits = self.scan_digits(16, 16); | |
683 | } | |
1a4d82fc | 684 | '0'...'9' | '_' | '.' => { |
c34b1796 | 685 | num_digits = self.scan_digits(10, 10) + 1; |
1a4d82fc JJ |
686 | } |
687 | _ => { | |
688 | // just a 0 | |
689 | return token::Integer(self.name_from(start_bpos)); | |
690 | } | |
691 | } | |
692 | } else if c.is_digit(10) { | |
c34b1796 | 693 | num_digits = self.scan_digits(10, 10) + 1; |
1a4d82fc JJ |
694 | } else { |
695 | num_digits = 0; | |
696 | } | |
697 | ||
698 | if num_digits == 0 { | |
9cc50fc6 SL |
699 | self.err_span_(start_bpos, |
700 | self.last_pos, | |
701 | "no valid digits found for number"); | |
1a4d82fc JJ |
702 | return token::Integer(token::intern("0")); |
703 | } | |
704 | ||
705 | // might be a float, but don't be greedy if this is actually an | |
706 | // integer literal followed by field/method access or a range pattern | |
707 | // (`0..2` and `12.foo()`) | |
9cc50fc6 SL |
708 | if self.curr_is('.') && !self.nextch_is('.') && |
709 | !self.nextch() | |
710 | .unwrap_or('\0') | |
711 | .is_xid_start() { | |
1a4d82fc JJ |
712 | // might have stuff after the ., and if it does, it needs to start |
713 | // with a number | |
714 | self.bump(); | |
715 | if self.curr.unwrap_or('\0').is_digit(10) { | |
c34b1796 | 716 | self.scan_digits(10, 10); |
1a4d82fc JJ |
717 | self.scan_float_exponent(); |
718 | } | |
719 | let last_pos = self.last_pos; | |
720 | self.check_float_base(start_bpos, last_pos, base); | |
721 | return token::Float(self.name_from(start_bpos)); | |
722 | } else { | |
723 | // it might be a float if it has an exponent | |
724 | if self.curr_is('e') || self.curr_is('E') { | |
725 | self.scan_float_exponent(); | |
726 | let last_pos = self.last_pos; | |
727 | self.check_float_base(start_bpos, last_pos, base); | |
728 | return token::Float(self.name_from(start_bpos)); | |
729 | } | |
730 | // but we certainly have an integer! | |
731 | return token::Integer(self.name_from(start_bpos)); | |
732 | } | |
733 | } | |
734 | ||
735 | /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an | |
736 | /// error if too many or too few digits are encountered. | |
9cc50fc6 | 737 | fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool { |
1a4d82fc JJ |
738 | debug!("scanning {} digits until {:?}", n_digits, delim); |
739 | let start_bpos = self.last_pos; | |
740 | let mut accum_int = 0; | |
741 | ||
c34b1796 | 742 | let mut valid = true; |
85aaf69f | 743 | for _ in 0..n_digits { |
1a4d82fc JJ |
744 | if self.is_eof() { |
745 | let last_bpos = self.last_pos; | |
92a42be0 SL |
746 | panic!(self.fatal_span_(start_bpos, |
747 | last_bpos, | |
748 | "unterminated numeric character escape")); | |
1a4d82fc JJ |
749 | } |
750 | if self.curr_is(delim) { | |
751 | let last_bpos = self.last_pos; | |
9cc50fc6 SL |
752 | self.err_span_(start_bpos, |
753 | last_bpos, | |
754 | "numeric character escape is too short"); | |
c34b1796 | 755 | valid = false; |
1a4d82fc JJ |
756 | break; |
757 | } | |
758 | let c = self.curr.unwrap_or('\x00'); | |
759 | accum_int *= 16; | |
760 | accum_int += c.to_digit(16).unwrap_or_else(|| { | |
9cc50fc6 SL |
761 | self.err_span_char(self.last_pos, |
762 | self.pos, | |
763 | "invalid character in numeric character escape", | |
764 | c); | |
c34b1796 AL |
765 | |
766 | valid = false; | |
1a4d82fc | 767 | 0 |
c34b1796 | 768 | }); |
1a4d82fc JJ |
769 | self.bump(); |
770 | } | |
771 | ||
772 | if below_0x7f_only && accum_int >= 0x80 { | |
773 | self.err_span_(start_bpos, | |
774 | self.last_pos, | |
9cc50fc6 SL |
775 | "this form of character escape may only be used with characters in \ |
776 | the range [\\x00-\\x7f]"); | |
c34b1796 | 777 | valid = false; |
1a4d82fc JJ |
778 | } |
779 | ||
780 | match char::from_u32(accum_int) { | |
c34b1796 | 781 | Some(_) => valid, |
1a4d82fc JJ |
782 | None => { |
783 | let last_bpos = self.last_pos; | |
c1a9b12d | 784 | self.err_span_(start_bpos, last_bpos, "invalid numeric character escape"); |
1a4d82fc JJ |
785 | false |
786 | } | |
787 | } | |
788 | } | |
789 | ||
1a4d82fc JJ |
790 | /// Scan for a single (possibly escaped) byte or char |
791 | /// in a byte, (non-raw) byte string, char, or (non-raw) string literal. | |
792 | /// `start` is the position of `first_source_char`, which is already consumed. | |
793 | /// | |
794 | /// Returns true if there was a valid char/byte, false otherwise. | |
9cc50fc6 SL |
795 | fn scan_char_or_byte(&mut self, |
796 | start: BytePos, | |
797 | first_source_char: char, | |
798 | ascii_only: bool, | |
799 | delim: char) | |
800 | -> bool { | |
1a4d82fc JJ |
801 | match first_source_char { |
802 | '\\' => { | |
803 | // '\X' for some X must be a character constant: | |
804 | let escaped = self.curr; | |
805 | let escaped_pos = self.last_pos; | |
806 | self.bump(); | |
807 | match escaped { | |
9cc50fc6 | 808 | None => {} // EOF here is an error that will be checked later. |
1a4d82fc JJ |
809 | Some(e) => { |
810 | return match e { | |
811 | 'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true, | |
812 | 'x' => self.scan_byte_escape(delim, !ascii_only), | |
c1a9b12d SL |
813 | 'u' => { |
814 | let valid = if self.curr_is('{') { | |
815 | self.scan_unicode_escape(delim) && !ascii_only | |
816 | } else { | |
9cc50fc6 SL |
817 | let span = codemap::mk_sp(start, self.last_pos); |
818 | self.span_diagnostic | |
819 | .struct_span_err(span, "incorrect unicode escape sequence") | |
820 | .span_help(span, | |
821 | "format of unicode escape sequences is \ | |
822 | `\\u{…}`") | |
823 | .emit(); | |
c1a9b12d SL |
824 | false |
825 | }; | |
826 | if ascii_only { | |
9cc50fc6 SL |
827 | self.err_span_(start, |
828 | self.last_pos, | |
829 | "unicode escape sequences cannot be used as a \ | |
830 | byte or in a byte string"); | |
62682a34 | 831 | } |
c1a9b12d SL |
832 | valid |
833 | ||
1a4d82fc JJ |
834 | } |
835 | '\n' if delim == '"' => { | |
836 | self.consume_whitespace(); | |
837 | true | |
9cc50fc6 | 838 | } |
1a4d82fc JJ |
839 | '\r' if delim == '"' && self.curr_is('\n') => { |
840 | self.consume_whitespace(); | |
841 | true | |
842 | } | |
843 | c => { | |
844 | let last_pos = self.last_pos; | |
9cc50fc6 SL |
845 | let mut err = self.struct_err_span_char(escaped_pos, |
846 | last_pos, | |
847 | if ascii_only { | |
848 | "unknown byte escape" | |
849 | } else { | |
850 | "unknown character \ | |
851 | escape" | |
852 | }, | |
853 | c); | |
1a4d82fc | 854 | if e == '\r' { |
9cc50fc6 SL |
855 | err.span_help(codemap::mk_sp(escaped_pos, last_pos), |
856 | "this is an isolated carriage return; consider \ | |
857 | checking your editor and version control \ | |
858 | settings"); | |
1a4d82fc | 859 | } |
9346a6ac | 860 | if (e == '{' || e == '}') && !ascii_only { |
9cc50fc6 SL |
861 | err.span_help(codemap::mk_sp(escaped_pos, last_pos), |
862 | "if used in a formatting string, curly braces \ | |
863 | are escaped with `{{` and `}}`"); | |
9346a6ac | 864 | } |
9cc50fc6 | 865 | err.emit(); |
1a4d82fc JJ |
866 | false |
867 | } | |
868 | } | |
869 | } | |
870 | } | |
871 | } | |
872 | '\t' | '\n' | '\r' | '\'' if delim == '\'' => { | |
873 | let last_pos = self.last_pos; | |
9cc50fc6 SL |
874 | self.err_span_char(start, |
875 | last_pos, | |
876 | if ascii_only { | |
877 | "byte constant must be escaped" | |
878 | } else { | |
879 | "character constant must be escaped" | |
880 | }, | |
881 | first_source_char); | |
1a4d82fc JJ |
882 | return false; |
883 | } | |
884 | '\r' => { | |
885 | if self.curr_is('\n') { | |
886 | self.bump(); | |
887 | return true; | |
888 | } else { | |
9cc50fc6 SL |
889 | self.err_span_(start, |
890 | self.last_pos, | |
1a4d82fc JJ |
891 | "bare CR not allowed in string, use \\r instead"); |
892 | return false; | |
893 | } | |
894 | } | |
9cc50fc6 SL |
895 | _ => { |
896 | if ascii_only && first_source_char > '\x7F' { | |
897 | let last_pos = self.last_pos; | |
898 | self.err_span_char(start, | |
899 | last_pos, | |
900 | "byte constant must be ASCII. Use a \\xHH escape for a \ | |
901 | non-ASCII byte", | |
902 | first_source_char); | |
903 | return false; | |
904 | } | |
1a4d82fc JJ |
905 | } |
906 | } | |
907 | true | |
908 | } | |
909 | ||
910 | /// Scan over a \u{...} escape | |
911 | /// | |
912 | /// At this point, we have already seen the \ and the u, the { is the current character. We | |
913 | /// will read at least one digit, and up to 6, and pass over the }. | |
914 | fn scan_unicode_escape(&mut self, delim: char) -> bool { | |
915 | self.bump(); // past the { | |
916 | let start_bpos = self.last_pos; | |
85aaf69f | 917 | let mut count = 0; |
1a4d82fc | 918 | let mut accum_int = 0; |
c34b1796 | 919 | let mut valid = true; |
1a4d82fc JJ |
920 | |
921 | while !self.curr_is('}') && count <= 6 { | |
922 | let c = match self.curr { | |
923 | Some(c) => c, | |
924 | None => { | |
9cc50fc6 SL |
925 | panic!(self.fatal_span_(start_bpos, |
926 | self.last_pos, | |
92a42be0 | 927 | "unterminated unicode escape (found EOF)")); |
1a4d82fc JJ |
928 | } |
929 | }; | |
930 | accum_int *= 16; | |
931 | accum_int += c.to_digit(16).unwrap_or_else(|| { | |
932 | if c == delim { | |
9cc50fc6 SL |
933 | panic!(self.fatal_span_(self.last_pos, |
934 | self.pos, | |
92a42be0 | 935 | "unterminated unicode escape (needed a `}`)")); |
1a4d82fc | 936 | } else { |
9cc50fc6 SL |
937 | self.err_span_char(self.last_pos, |
938 | self.pos, | |
939 | "invalid character in unicode escape", | |
940 | c); | |
1a4d82fc | 941 | } |
c34b1796 AL |
942 | valid = false; |
943 | 0 | |
944 | }); | |
1a4d82fc JJ |
945 | self.bump(); |
946 | count += 1; | |
947 | } | |
948 | ||
949 | if count > 6 { | |
9cc50fc6 SL |
950 | self.err_span_(start_bpos, |
951 | self.last_pos, | |
952 | "overlong unicode escape (can have at most 6 hex digits)"); | |
c34b1796 | 953 | valid = false; |
1a4d82fc JJ |
954 | } |
955 | ||
c34b1796 | 956 | if valid && (char::from_u32(accum_int).is_none() || count == 0) { |
9cc50fc6 SL |
957 | self.err_span_(start_bpos, |
958 | self.last_pos, | |
959 | "invalid unicode character escape"); | |
62682a34 | 960 | valid = false; |
1a4d82fc JJ |
961 | } |
962 | ||
c1a9b12d | 963 | self.bump(); // past the ending } |
1a4d82fc JJ |
964 | valid |
965 | } | |
966 | ||
967 | /// Scan over a float exponent. | |
968 | fn scan_float_exponent(&mut self) { | |
969 | if self.curr_is('e') || self.curr_is('E') { | |
970 | self.bump(); | |
971 | if self.curr_is('-') || self.curr_is('+') { | |
972 | self.bump(); | |
973 | } | |
c34b1796 | 974 | if self.scan_digits(10, 10) == 0 { |
9cc50fc6 SL |
975 | self.err_span_(self.last_pos, |
976 | self.pos, | |
977 | "expected at least one digit in exponent") | |
1a4d82fc JJ |
978 | } |
979 | } | |
980 | } | |
981 | ||
982 | /// Check that a base is valid for a floating literal, emitting a nice | |
983 | /// error if it isn't. | |
85aaf69f | 984 | fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) { |
1a4d82fc | 985 | match base { |
9cc50fc6 SL |
986 | 16 => { |
987 | self.err_span_(start_bpos, | |
988 | last_bpos, | |
989 | "hexadecimal float literal is not supported") | |
990 | } | |
991 | 8 => { | |
992 | self.err_span_(start_bpos, | |
993 | last_bpos, | |
994 | "octal float literal is not supported") | |
995 | } | |
996 | 2 => { | |
997 | self.err_span_(start_bpos, | |
998 | last_bpos, | |
999 | "binary float literal is not supported") | |
1000 | } | |
1001 | _ => (), | |
1a4d82fc JJ |
1002 | } |
1003 | } | |
1004 | ||
1005 | fn binop(&mut self, op: token::BinOpToken) -> token::Token { | |
1006 | self.bump(); | |
1007 | if self.curr_is('=') { | |
1008 | self.bump(); | |
1009 | return token::BinOpEq(op); | |
1010 | } else { | |
1011 | return token::BinOp(op); | |
1012 | } | |
1013 | } | |
1014 | ||
1015 | /// Return the next token from the string, advances the input past that | |
1016 | /// token, and updates the interner | |
1017 | fn next_token_inner(&mut self) -> token::Token { | |
1018 | let c = self.curr; | |
9cc50fc6 SL |
1019 | if ident_start(c) && |
1020 | match (c.unwrap(), self.nextch(), self.nextnextch()) { | |
1a4d82fc JJ |
1021 | // Note: r as in r" or r#" is part of a raw string literal, |
1022 | // b as in b' is part of a byte literal. | |
1023 | // They are not identifiers, and are handled further down. | |
9cc50fc6 SL |
1024 | ('r', Some('"'), _) | |
1025 | ('r', Some('#'), _) | | |
1026 | ('b', Some('"'), _) | | |
1027 | ('b', Some('\''), _) | | |
1028 | ('b', Some('r'), Some('"')) | | |
1029 | ('b', Some('r'), Some('#')) => false, | |
1030 | _ => true, | |
1a4d82fc JJ |
1031 | } { |
1032 | let start = self.last_pos; | |
1033 | while ident_continue(self.curr) { | |
1034 | self.bump(); | |
1035 | } | |
1036 | ||
1037 | return self.with_str_from(start, |string| { | |
1038 | if string == "_" { | |
1039 | token::Underscore | |
1040 | } else { | |
1041 | // FIXME: perform NFKC normalization here. (Issue #2253) | |
1042 | if self.curr_is(':') && self.nextch_is(':') { | |
1043 | token::Ident(str_to_ident(string), token::ModName) | |
1044 | } else { | |
1045 | token::Ident(str_to_ident(string), token::Plain) | |
1046 | } | |
1047 | } | |
1048 | }); | |
1049 | } | |
1050 | ||
1051 | if is_dec_digit(c) { | |
1052 | let num = self.scan_number(c.unwrap()); | |
1053 | let suffix = self.scan_optional_raw_name(); | |
1054 | debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix); | |
9cc50fc6 | 1055 | return token::Literal(num, suffix); |
1a4d82fc JJ |
1056 | } |
1057 | ||
1a4d82fc | 1058 | match c.expect("next_token_inner called at EOF") { |
9cc50fc6 SL |
1059 | // One-byte tokens. |
1060 | ';' => { | |
1a4d82fc | 1061 | self.bump(); |
9cc50fc6 SL |
1062 | return token::Semi; |
1063 | } | |
1064 | ',' => { | |
1065 | self.bump(); | |
1066 | return token::Comma; | |
1067 | } | |
1068 | '.' => { | |
1069 | self.bump(); | |
1070 | return if self.curr_is('.') { | |
1071 | self.bump(); | |
1072 | if self.curr_is('.') { | |
1073 | self.bump(); | |
1074 | token::DotDotDot | |
1075 | } else { | |
1076 | token::DotDot | |
1077 | } | |
1078 | } else { | |
1079 | token::Dot | |
1080 | }; | |
1081 | } | |
1082 | '(' => { | |
1083 | self.bump(); | |
1084 | return token::OpenDelim(token::Paren); | |
1085 | } | |
1086 | ')' => { | |
1087 | self.bump(); | |
1088 | return token::CloseDelim(token::Paren); | |
1089 | } | |
1090 | '{' => { | |
1091 | self.bump(); | |
1092 | return token::OpenDelim(token::Brace); | |
1093 | } | |
1094 | '}' => { | |
1095 | self.bump(); | |
1096 | return token::CloseDelim(token::Brace); | |
1097 | } | |
1098 | '[' => { | |
1099 | self.bump(); | |
1100 | return token::OpenDelim(token::Bracket); | |
1101 | } | |
1102 | ']' => { | |
1103 | self.bump(); | |
1104 | return token::CloseDelim(token::Bracket); | |
1105 | } | |
1106 | '@' => { | |
1107 | self.bump(); | |
1108 | return token::At; | |
1109 | } | |
1110 | '#' => { | |
1111 | self.bump(); | |
1112 | return token::Pound; | |
1113 | } | |
1114 | '~' => { | |
1115 | self.bump(); | |
1116 | return token::Tilde; | |
1117 | } | |
1118 | '?' => { | |
1119 | self.bump(); | |
1120 | return token::Question; | |
1121 | } | |
1122 | ':' => { | |
1123 | self.bump(); | |
1124 | if self.curr_is(':') { | |
1125 | self.bump(); | |
1126 | return token::ModSep; | |
1127 | } else { | |
1128 | return token::Colon; | |
1129 | } | |
1a4d82fc | 1130 | } |
1a4d82fc | 1131 | |
9cc50fc6 SL |
1132 | '$' => { |
1133 | self.bump(); | |
1134 | return token::Dollar; | |
1135 | } | |
1a4d82fc | 1136 | |
9cc50fc6 SL |
1137 | // Multi-byte tokens. |
1138 | '=' => { | |
1a4d82fc | 1139 | self.bump(); |
9cc50fc6 SL |
1140 | if self.curr_is('=') { |
1141 | self.bump(); | |
1142 | return token::EqEq; | |
1143 | } else if self.curr_is('>') { | |
1144 | self.bump(); | |
1145 | return token::FatArrow; | |
1146 | } else { | |
1147 | return token::Eq; | |
1148 | } | |
1149 | } | |
1150 | '!' => { | |
1a4d82fc | 1151 | self.bump(); |
9cc50fc6 SL |
1152 | if self.curr_is('=') { |
1153 | self.bump(); | |
1154 | return token::Ne; | |
1155 | } else { | |
1156 | return token::Not; | |
1157 | } | |
1a4d82fc | 1158 | } |
9cc50fc6 | 1159 | '<' => { |
1a4d82fc | 1160 | self.bump(); |
9cc50fc6 SL |
1161 | match self.curr.unwrap_or('\x00') { |
1162 | '=' => { | |
1163 | self.bump(); | |
1164 | return token::Le; | |
1165 | } | |
1166 | '<' => { | |
1167 | return self.binop(token::Shl); | |
1168 | } | |
1169 | '-' => { | |
1170 | self.bump(); | |
1171 | match self.curr.unwrap_or('\x00') { | |
1172 | _ => { | |
1173 | return token::LArrow; | |
1174 | } | |
1175 | } | |
1176 | } | |
1177 | _ => { | |
1178 | return token::Lt; | |
1179 | } | |
1180 | } | |
1181 | } | |
1182 | '>' => { | |
1a4d82fc JJ |
1183 | self.bump(); |
1184 | match self.curr.unwrap_or('\x00') { | |
9cc50fc6 SL |
1185 | '=' => { |
1186 | self.bump(); | |
1187 | return token::Ge; | |
1188 | } | |
1189 | '>' => { | |
1190 | return self.binop(token::Shr); | |
1191 | } | |
1192 | _ => { | |
1193 | return token::Gt; | |
1194 | } | |
1a4d82fc | 1195 | } |
1a4d82fc | 1196 | } |
9cc50fc6 SL |
1197 | '\'' => { |
1198 | // Either a character constant 'a' OR a lifetime name 'abc | |
1199 | let start_with_quote = self.last_pos; | |
1200 | self.bump(); | |
1201 | let start = self.last_pos; | |
1a4d82fc | 1202 | |
9cc50fc6 SL |
1203 | // the eof will be picked up by the final `'` check below |
1204 | let c2 = self.curr.unwrap_or('\x00'); | |
1205 | self.bump(); | |
1a4d82fc | 1206 | |
9cc50fc6 SL |
1207 | // If the character is an ident start not followed by another single |
1208 | // quote, then this is a lifetime name: | |
1209 | if ident_start(Some(c2)) && !self.curr_is('\'') { | |
1210 | while ident_continue(self.curr) { | |
1211 | self.bump(); | |
1212 | } | |
1213 | // lifetimes shouldn't end with a single quote | |
1214 | // if we find one, then this is an invalid character literal | |
1215 | if self.curr_is('\'') { | |
1216 | panic!(self.fatal_span_verbose( | |
1217 | start_with_quote, self.pos, | |
1218 | String::from("character literal may only contain one codepoint"))); | |
1a4d82fc | 1219 | |
9cc50fc6 | 1220 | } |
1a4d82fc | 1221 | |
9cc50fc6 SL |
1222 | // Include the leading `'` in the real identifier, for macro |
1223 | // expansion purposes. See #12512 for the gory details of why | |
1224 | // this is necessary. | |
1225 | let ident = self.with_str_from(start, |lifetime_name| { | |
1226 | str_to_ident(&format!("'{}", lifetime_name)) | |
1227 | }); | |
1228 | ||
1229 | // Conjure up a "keyword checking ident" to make sure that | |
1230 | // the lifetime name is not a keyword. | |
1231 | let keyword_checking_ident = self.with_str_from(start, |lifetime_name| { | |
1a4d82fc JJ |
1232 | str_to_ident(lifetime_name) |
1233 | }); | |
9cc50fc6 SL |
1234 | let keyword_checking_token = &token::Ident(keyword_checking_ident, |
1235 | token::Plain); | |
1236 | let last_bpos = self.last_pos; | |
1237 | if keyword_checking_token.is_keyword(token::keywords::SelfValue) { | |
1238 | self.err_span_(start, | |
1239 | last_bpos, | |
1240 | "invalid lifetime name: 'self is no longer a special \ | |
1241 | lifetime"); | |
1242 | } else if keyword_checking_token.is_any_keyword() && | |
1243 | !keyword_checking_token.is_keyword(token::keywords::Static) { | |
1244 | self.err_span_(start, last_bpos, "invalid lifetime name"); | |
1245 | } | |
1246 | ||
1247 | return token::Lifetime(ident); | |
1a4d82fc | 1248 | } |
1a4d82fc | 1249 | |
9cc50fc6 SL |
1250 | let valid = self.scan_char_or_byte(start, |
1251 | c2, | |
1252 | // ascii_only = | |
1253 | false, | |
1254 | '\''); | |
92a42be0 | 1255 | |
9cc50fc6 SL |
1256 | if !self.curr_is('\'') { |
1257 | panic!(self.fatal_span_verbose( | |
1258 | start_with_quote, self.last_pos, | |
1259 | String::from("character literal may only contain one codepoint"))); | |
1a4d82fc JJ |
1260 | } |
1261 | ||
9cc50fc6 SL |
1262 | let id = if valid { |
1263 | self.name_from(start) | |
1264 | } else { | |
1265 | token::intern("0") | |
1266 | }; | |
1267 | self.bump(); // advance curr past token | |
1268 | let suffix = self.scan_optional_raw_name(); | |
1269 | return token::Literal(token::Char(id), suffix); | |
1a4d82fc | 1270 | } |
9cc50fc6 | 1271 | 'b' => { |
1a4d82fc | 1272 | self.bump(); |
9cc50fc6 SL |
1273 | let lit = match self.curr { |
1274 | Some('\'') => self.scan_byte(), | |
1275 | Some('"') => self.scan_byte_string(), | |
1276 | Some('r') => self.scan_raw_byte_string(), | |
1277 | _ => unreachable!(), // Should have been a token::Ident above. | |
1278 | }; | |
1279 | let suffix = self.scan_optional_raw_name(); | |
1280 | return token::Literal(lit, suffix); | |
1a4d82fc | 1281 | } |
9cc50fc6 SL |
1282 | '"' => { |
1283 | let start_bpos = self.last_pos; | |
1284 | let mut valid = true; | |
1285 | self.bump(); | |
1286 | while !self.curr_is('"') { | |
1287 | if self.is_eof() { | |
1288 | let last_bpos = self.last_pos; | |
1289 | panic!(self.fatal_span_(start_bpos, | |
1290 | last_bpos, | |
1291 | "unterminated double quote string")); | |
1292 | } | |
1a4d82fc | 1293 | |
9cc50fc6 SL |
1294 | let ch_start = self.last_pos; |
1295 | let ch = self.curr.unwrap(); | |
1296 | self.bump(); | |
1297 | valid &= self.scan_char_or_byte(ch_start, | |
1298 | ch, | |
1299 | // ascii_only = | |
1300 | false, | |
1301 | '"'); | |
1302 | } | |
1303 | // adjust for the ASCII " at the start of the literal | |
1304 | let id = if valid { | |
1305 | self.name_from(start_bpos + BytePos(1)) | |
1306 | } else { | |
1307 | token::intern("??") | |
1308 | }; | |
1309 | self.bump(); | |
1310 | let suffix = self.scan_optional_raw_name(); | |
1311 | return token::Literal(token::Str_(id), suffix); | |
1a4d82fc | 1312 | } |
9cc50fc6 SL |
1313 | 'r' => { |
1314 | let start_bpos = self.last_pos; | |
1315 | self.bump(); | |
1316 | let mut hash_count = 0; | |
1317 | while self.curr_is('#') { | |
1318 | self.bump(); | |
1319 | hash_count += 1; | |
1320 | } | |
1321 | ||
1a4d82fc JJ |
1322 | if self.is_eof() { |
1323 | let last_bpos = self.last_pos; | |
92a42be0 | 1324 | panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string")); |
9cc50fc6 SL |
1325 | } else if !self.curr_is('"') { |
1326 | let last_bpos = self.last_pos; | |
1327 | let curr_char = self.curr.unwrap(); | |
1328 | panic!(self.fatal_span_char(start_bpos, | |
1329 | last_bpos, | |
1330 | "found invalid character; only `#` is allowed \ | |
1331 | in raw string delimitation", | |
1332 | curr_char)); | |
1a4d82fc | 1333 | } |
9cc50fc6 SL |
1334 | self.bump(); |
1335 | let content_start_bpos = self.last_pos; | |
1336 | let mut content_end_bpos; | |
1337 | let mut valid = true; | |
1338 | 'outer: loop { | |
1339 | if self.is_eof() { | |
1340 | let last_bpos = self.last_pos; | |
1341 | panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string")); | |
1342 | } | |
1343 | // if self.curr_is('"') { | |
1344 | // content_end_bpos = self.last_pos; | |
1345 | // for _ in 0..hash_count { | |
1346 | // self.bump(); | |
1347 | // if !self.curr_is('#') { | |
1348 | // continue 'outer; | |
1349 | let c = self.curr.unwrap(); | |
1350 | match c { | |
1351 | '"' => { | |
1352 | content_end_bpos = self.last_pos; | |
1353 | for _ in 0..hash_count { | |
1354 | self.bump(); | |
1355 | if !self.curr_is('#') { | |
1356 | continue 'outer; | |
1357 | } | |
1a4d82fc | 1358 | } |
9cc50fc6 | 1359 | break; |
1a4d82fc | 1360 | } |
9cc50fc6 SL |
1361 | '\r' => { |
1362 | if !self.nextch_is('\n') { | |
1363 | let last_bpos = self.last_pos; | |
1364 | self.err_span_(start_bpos, | |
1365 | last_bpos, | |
1366 | "bare CR not allowed in raw string, use \\r \ | |
1367 | instead"); | |
1368 | valid = false; | |
1369 | } | |
1a4d82fc | 1370 | } |
9cc50fc6 | 1371 | _ => (), |
1a4d82fc | 1372 | } |
9cc50fc6 | 1373 | self.bump(); |
1a4d82fc JJ |
1374 | } |
1375 | self.bump(); | |
9cc50fc6 SL |
1376 | let id = if valid { |
1377 | self.name_from_to(content_start_bpos, content_end_bpos) | |
1378 | } else { | |
1379 | token::intern("??") | |
1380 | }; | |
1381 | let suffix = self.scan_optional_raw_name(); | |
1382 | return token::Literal(token::StrRaw(id, hash_count), suffix); | |
1383 | } | |
1384 | '-' => { | |
1385 | if self.nextch_is('>') { | |
1386 | self.bump(); | |
1387 | self.bump(); | |
1388 | return token::RArrow; | |
1389 | } else { | |
1390 | return self.binop(token::Minus); | |
1391 | } | |
1392 | } | |
1393 | '&' => { | |
1394 | if self.nextch_is('&') { | |
1395 | self.bump(); | |
1396 | self.bump(); | |
1397 | return token::AndAnd; | |
1398 | } else { | |
1399 | return self.binop(token::And); | |
1400 | } | |
1401 | } | |
1402 | '|' => { | |
1403 | match self.nextch() { | |
1404 | Some('|') => { | |
1405 | self.bump(); | |
1406 | self.bump(); | |
1407 | return token::OrOr; | |
1408 | } | |
1409 | _ => { | |
1410 | return self.binop(token::Or); | |
1411 | } | |
1412 | } | |
1413 | } | |
1414 | '+' => { | |
1415 | return self.binop(token::Plus); | |
1416 | } | |
1417 | '*' => { | |
1418 | return self.binop(token::Star); | |
1419 | } | |
1420 | '/' => { | |
1421 | return self.binop(token::Slash); | |
1422 | } | |
1423 | '^' => { | |
1424 | return self.binop(token::Caret); | |
1425 | } | |
1426 | '%' => { | |
1427 | return self.binop(token::Percent); | |
1428 | } | |
1429 | c => { | |
1430 | let last_bpos = self.last_pos; | |
1431 | let bpos = self.pos; | |
1432 | let mut err = self.struct_fatal_span_char(last_bpos, | |
1433 | bpos, | |
1434 | "unknown start of token", | |
1435 | c); | |
1436 | unicode_chars::check_for_substitution(&self, c, &mut err); | |
1437 | err.emit(); | |
1438 | panic!(FatalError); | |
1a4d82fc | 1439 | } |
1a4d82fc JJ |
1440 | } |
1441 | } | |
1442 | ||
1443 | fn consume_whitespace(&mut self) { | |
54a0048b | 1444 | while is_pattern_whitespace(self.curr) && !self.is_eof() { |
9cc50fc6 SL |
1445 | self.bump(); |
1446 | } | |
1a4d82fc JJ |
1447 | } |
1448 | ||
1449 | fn read_to_eol(&mut self) -> String { | |
1450 | let mut val = String::new(); | |
1451 | while !self.curr_is('\n') && !self.is_eof() { | |
1452 | val.push(self.curr.unwrap()); | |
1453 | self.bump(); | |
1454 | } | |
9cc50fc6 SL |
1455 | if self.curr_is('\n') { |
1456 | self.bump(); | |
1457 | } | |
1458 | return val; | |
1a4d82fc JJ |
1459 | } |
1460 | ||
1461 | fn read_one_line_comment(&mut self) -> String { | |
1462 | let val = self.read_to_eol(); | |
9cc50fc6 SL |
1463 | assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') || |
1464 | (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!')); | |
1a4d82fc JJ |
1465 | return val; |
1466 | } | |
1467 | ||
1468 | fn consume_non_eol_whitespace(&mut self) { | |
54a0048b | 1469 | while is_pattern_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() { |
1a4d82fc JJ |
1470 | self.bump(); |
1471 | } | |
1472 | } | |
1473 | ||
1474 | fn peeking_at_comment(&self) -> bool { | |
9cc50fc6 SL |
1475 | (self.curr_is('/') && self.nextch_is('/')) || (self.curr_is('/') && self.nextch_is('*')) || |
1476 | // consider shebangs comments, but not inner attributes | |
1477 | (self.curr_is('#') && self.nextch_is('!') && !self.nextnextch_is('[')) | |
1a4d82fc JJ |
1478 | } |
1479 | ||
1480 | fn scan_byte(&mut self) -> token::Lit { | |
1481 | self.bump(); | |
1482 | let start = self.last_pos; | |
1483 | ||
1484 | // the eof will be picked up by the final `'` check below | |
1485 | let c2 = self.curr.unwrap_or('\x00'); | |
1486 | self.bump(); | |
1487 | ||
9cc50fc6 SL |
1488 | let valid = self.scan_char_or_byte(start, |
1489 | c2, | |
1490 | // ascii_only = | |
1491 | true, | |
1492 | '\''); | |
1a4d82fc JJ |
1493 | if !self.curr_is('\'') { |
1494 | // Byte offsetting here is okay because the | |
1495 | // character before position `start` are an | |
1496 | // ascii single quote and ascii 'b'. | |
1497 | let last_pos = self.last_pos; | |
9cc50fc6 SL |
1498 | panic!(self.fatal_span_verbose(start - BytePos(2), |
1499 | last_pos, | |
1500 | "unterminated byte constant".to_string())); | |
1a4d82fc JJ |
1501 | } |
1502 | ||
9cc50fc6 SL |
1503 | let id = if valid { |
1504 | self.name_from(start) | |
1505 | } else { | |
1506 | token::intern("?") | |
1507 | }; | |
1a4d82fc JJ |
1508 | self.bump(); // advance curr past token |
1509 | return token::Byte(id); | |
1510 | } | |
1511 | ||
1512 | fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool { | |
1513 | self.scan_hex_digits(2, delim, below_0x7f_only) | |
1514 | } | |
1515 | ||
1516 | fn scan_byte_string(&mut self) -> token::Lit { | |
1517 | self.bump(); | |
1518 | let start = self.last_pos; | |
1519 | let mut valid = true; | |
1520 | ||
1521 | while !self.curr_is('"') { | |
1522 | if self.is_eof() { | |
1523 | let last_pos = self.last_pos; | |
92a42be0 | 1524 | panic!(self.fatal_span_(start, last_pos, "unterminated double quote byte string")); |
1a4d82fc JJ |
1525 | } |
1526 | ||
1527 | let ch_start = self.last_pos; | |
1528 | let ch = self.curr.unwrap(); | |
1529 | self.bump(); | |
9cc50fc6 SL |
1530 | valid &= self.scan_char_or_byte(ch_start, |
1531 | ch, | |
1532 | // ascii_only = | |
1533 | true, | |
1534 | '"'); | |
1a4d82fc | 1535 | } |
9cc50fc6 SL |
1536 | let id = if valid { |
1537 | self.name_from(start) | |
1538 | } else { | |
1539 | token::intern("??") | |
1540 | }; | |
1a4d82fc | 1541 | self.bump(); |
e9174d1e | 1542 | return token::ByteStr(id); |
1a4d82fc JJ |
1543 | } |
1544 | ||
1545 | fn scan_raw_byte_string(&mut self) -> token::Lit { | |
1546 | let start_bpos = self.last_pos; | |
1547 | self.bump(); | |
85aaf69f | 1548 | let mut hash_count = 0; |
1a4d82fc JJ |
1549 | while self.curr_is('#') { |
1550 | self.bump(); | |
1551 | hash_count += 1; | |
1552 | } | |
1553 | ||
1554 | if self.is_eof() { | |
1555 | let last_pos = self.last_pos; | |
92a42be0 | 1556 | panic!(self.fatal_span_(start_bpos, last_pos, "unterminated raw string")); |
1a4d82fc JJ |
1557 | } else if !self.curr_is('"') { |
1558 | let last_pos = self.last_pos; | |
1559 | let ch = self.curr.unwrap(); | |
9cc50fc6 SL |
1560 | panic!(self.fatal_span_char(start_bpos, |
1561 | last_pos, | |
1562 | "found invalid character; only `#` is allowed in raw \ | |
1563 | string delimitation", | |
1564 | ch)); | |
1a4d82fc JJ |
1565 | } |
1566 | self.bump(); | |
1567 | let content_start_bpos = self.last_pos; | |
1568 | let mut content_end_bpos; | |
1569 | 'outer: loop { | |
1570 | match self.curr { | |
1571 | None => { | |
1572 | let last_pos = self.last_pos; | |
92a42be0 | 1573 | panic!(self.fatal_span_(start_bpos, last_pos, "unterminated raw string")) |
9cc50fc6 | 1574 | } |
1a4d82fc JJ |
1575 | Some('"') => { |
1576 | content_end_bpos = self.last_pos; | |
85aaf69f | 1577 | for _ in 0..hash_count { |
1a4d82fc JJ |
1578 | self.bump(); |
1579 | if !self.curr_is('#') { | |
1580 | continue 'outer; | |
1581 | } | |
1582 | } | |
1583 | break; | |
9cc50fc6 SL |
1584 | } |
1585 | Some(c) => { | |
1586 | if c > '\x7F' { | |
1587 | let last_pos = self.last_pos; | |
1588 | self.err_span_char(last_pos, last_pos, "raw byte string must be ASCII", c); | |
1589 | } | |
1a4d82fc JJ |
1590 | } |
1591 | } | |
1592 | self.bump(); | |
1593 | } | |
1594 | self.bump(); | |
9cc50fc6 SL |
1595 | return token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos), |
1596 | hash_count); | |
1a4d82fc JJ |
1597 | } |
1598 | } | |
1599 | ||
54a0048b SL |
1600 | // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which |
1601 | // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3 | |
1602 | pub fn is_pattern_whitespace(c: Option<char>) -> bool { | |
1603 | c.map_or(false, Pattern_White_Space) | |
1a4d82fc JJ |
1604 | } |
1605 | ||
/// Returns `true` when `c` holds a character within the inclusive range
/// `lo..=hi`; `None` is never in range.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}
1612 | ||
9cc50fc6 SL |
1613 | fn is_dec_digit(c: Option<char>) -> bool { |
1614 | return in_range(c, '0', '9'); | |
1615 | } | |
1a4d82fc JJ |
1616 | |
1617 | pub fn is_doc_comment(s: &str) -> bool { | |
9cc50fc6 SL |
1618 | let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') || |
1619 | s.starts_with("//!"); | |
1a4d82fc JJ |
1620 | debug!("is {:?} a doc comment? {}", s, res); |
1621 | res | |
1622 | } | |
1623 | ||
1624 | pub fn is_block_doc_comment(s: &str) -> bool { | |
9cc50fc6 SL |
1625 | // Prevent `/**/` from being parsed as a doc comment |
1626 | let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') || | |
1627 | s.starts_with("/*!")) && s.len() >= 5; | |
1a4d82fc JJ |
1628 | debug!("is {:?} a doc comment? {}", s, res); |
1629 | res | |
1630 | } | |
1631 | ||
1632 | fn ident_start(c: Option<char>) -> bool { | |
9cc50fc6 SL |
1633 | let c = match c { |
1634 | Some(c) => c, | |
1635 | None => return false, | |
1636 | }; | |
1a4d82fc | 1637 | |
9cc50fc6 | 1638 | (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start()) |
1a4d82fc JJ |
1639 | } |
1640 | ||
1641 | fn ident_continue(c: Option<char>) -> bool { | |
9cc50fc6 SL |
1642 | let c = match c { |
1643 | Some(c) => c, | |
1644 | None => return false, | |
1645 | }; | |
1a4d82fc | 1646 | |
9cc50fc6 SL |
1647 | (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || |
1648 | (c > '\x7f' && c.is_xid_continue()) | |
1a4d82fc JJ |
1649 | } |
1650 | ||
1651 | #[cfg(test)] | |
d9579d0f | 1652 | mod tests { |
1a4d82fc JJ |
1653 | use super::*; |
1654 | ||
1655 | use codemap::{BytePos, CodeMap, Span, NO_EXPANSION}; | |
9cc50fc6 | 1656 | use errors; |
1a4d82fc | 1657 | use parse::token; |
9cc50fc6 | 1658 | use parse::token::str_to_ident; |
c34b1796 | 1659 | use std::io; |
9cc50fc6 | 1660 | use std::rc::Rc; |
1a4d82fc | 1661 | |
9cc50fc6 | 1662 | fn mk_sh(cm: Rc<CodeMap>) -> errors::Handler { |
c34b1796 | 1663 | // FIXME (#22405): Replace `Box::new` with `box` here when/if possible. |
9cc50fc6 SL |
1664 | let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), None, cm); |
1665 | errors::Handler::with_emitter(true, false, Box::new(emitter)) | |
1a4d82fc JJ |
1666 | } |
1667 | ||
1668 | // open a string reader for the given string | |
9cc50fc6 SL |
1669 | fn setup<'a>(cm: &CodeMap, |
1670 | span_handler: &'a errors::Handler, | |
1671 | teststr: String) | |
1672 | -> StringReader<'a> { | |
1673 | let fm = cm.new_filemap("zebra.rs".to_string(), teststr); | |
1a4d82fc JJ |
1674 | StringReader::new(span_handler, fm) |
1675 | } | |
1676 | ||
9cc50fc6 SL |
1677 | #[test] |
1678 | fn t1() { | |
1679 | let cm = Rc::new(CodeMap::new()); | |
1680 | let sh = mk_sh(cm.clone()); | |
1681 | let mut string_reader = setup(&cm, | |
1682 | &sh, | |
1683 | "/* my source file */ fn main() { println!(\"zebra\"); }\n" | |
1684 | .to_string()); | |
1a4d82fc JJ |
1685 | let id = str_to_ident("fn"); |
1686 | assert_eq!(string_reader.next_token().tok, token::Comment); | |
1687 | assert_eq!(string_reader.next_token().tok, token::Whitespace); | |
1688 | let tok1 = string_reader.next_token(); | |
9cc50fc6 SL |
1689 | let tok2 = TokenAndSpan { |
1690 | tok: token::Ident(id, token::Plain), | |
1691 | sp: Span { | |
1692 | lo: BytePos(21), | |
1693 | hi: BytePos(23), | |
1694 | expn_id: NO_EXPANSION, | |
1695 | }, | |
1696 | }; | |
1697 | assert_eq!(tok1, tok2); | |
1a4d82fc JJ |
1698 | assert_eq!(string_reader.next_token().tok, token::Whitespace); |
1699 | // the 'main' id is already read: | |
1700 | assert_eq!(string_reader.last_pos.clone(), BytePos(28)); | |
1701 | // read another token: | |
1702 | let tok3 = string_reader.next_token(); | |
9cc50fc6 SL |
1703 | let tok4 = TokenAndSpan { |
1704 | tok: token::Ident(str_to_ident("main"), token::Plain), | |
1705 | sp: Span { | |
1706 | lo: BytePos(24), | |
1707 | hi: BytePos(28), | |
1708 | expn_id: NO_EXPANSION, | |
1709 | }, | |
1710 | }; | |
1711 | assert_eq!(tok3, tok4); | |
1a4d82fc JJ |
1712 | // the lparen is already read: |
1713 | assert_eq!(string_reader.last_pos.clone(), BytePos(29)) | |
1714 | } | |
1715 | ||
1716 | // check that the given reader produces the desired stream | |
1717 | // of tokens (stop checking after exhausting the expected vec) | |
9cc50fc6 | 1718 | fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) { |
85aaf69f | 1719 | for expected_tok in &expected { |
1a4d82fc JJ |
1720 | assert_eq!(&string_reader.next_token().tok, expected_tok); |
1721 | } | |
1722 | } | |
1723 | ||
1724 | // make the identifier by looking up the string in the interner | |
1725 | fn mk_ident(id: &str, style: token::IdentStyle) -> token::Token { | |
1726 | token::Ident(str_to_ident(id), style) | |
1727 | } | |
1728 | ||
9cc50fc6 SL |
1729 | #[test] |
1730 | fn doublecolonparsing() { | |
1731 | let cm = Rc::new(CodeMap::new()); | |
1732 | let sh = mk_sh(cm.clone()); | |
1733 | check_tokenization(setup(&cm, &sh, "a b".to_string()), | |
1a4d82fc JJ |
1734 | vec![mk_ident("a", token::Plain), |
1735 | token::Whitespace, | |
1736 | mk_ident("b", token::Plain)]); | |
1737 | } | |
1738 | ||
9cc50fc6 SL |
1739 | #[test] |
1740 | fn dcparsing_2() { | |
1741 | let cm = Rc::new(CodeMap::new()); | |
1742 | let sh = mk_sh(cm.clone()); | |
1743 | check_tokenization(setup(&cm, &sh, "a::b".to_string()), | |
1744 | vec![mk_ident("a", token::ModName), | |
1a4d82fc JJ |
1745 | token::ModSep, |
1746 | mk_ident("b", token::Plain)]); | |
1747 | } | |
1748 | ||
9cc50fc6 SL |
1749 | #[test] |
1750 | fn dcparsing_3() { | |
1751 | let cm = Rc::new(CodeMap::new()); | |
1752 | let sh = mk_sh(cm.clone()); | |
1753 | check_tokenization(setup(&cm, &sh, "a ::b".to_string()), | |
1a4d82fc JJ |
1754 | vec![mk_ident("a", token::Plain), |
1755 | token::Whitespace, | |
1756 | token::ModSep, | |
1757 | mk_ident("b", token::Plain)]); | |
1758 | } | |
1759 | ||
9cc50fc6 SL |
1760 | #[test] |
1761 | fn dcparsing_4() { | |
1762 | let cm = Rc::new(CodeMap::new()); | |
1763 | let sh = mk_sh(cm.clone()); | |
1764 | check_tokenization(setup(&cm, &sh, "a:: b".to_string()), | |
1765 | vec![mk_ident("a", token::ModName), | |
1a4d82fc JJ |
1766 | token::ModSep, |
1767 | token::Whitespace, | |
1768 | mk_ident("b", token::Plain)]); | |
1769 | } | |
1770 | ||
9cc50fc6 SL |
1771 | #[test] |
1772 | fn character_a() { | |
1773 | let cm = Rc::new(CodeMap::new()); | |
1774 | let sh = mk_sh(cm.clone()); | |
1775 | assert_eq!(setup(&cm, &sh, "'a'".to_string()).next_token().tok, | |
1a4d82fc JJ |
1776 | token::Literal(token::Char(token::intern("a")), None)); |
1777 | } | |
1778 | ||
9cc50fc6 SL |
1779 | #[test] |
1780 | fn character_space() { | |
1781 | let cm = Rc::new(CodeMap::new()); | |
1782 | let sh = mk_sh(cm.clone()); | |
1783 | assert_eq!(setup(&cm, &sh, "' '".to_string()).next_token().tok, | |
1a4d82fc JJ |
1784 | token::Literal(token::Char(token::intern(" ")), None)); |
1785 | } | |
1786 | ||
9cc50fc6 SL |
/// The first token of `'\n'` (backslash followed by `n` in the source
/// text) must be a `Char` literal whose contents are the two-character
/// escape sequence as written, with no suffix.
#[test]
fn character_escaped() {
    let code_map = Rc::new(CodeMap::new());
    let sh = mk_sh(code_map.clone());
    let mut reader = setup(&code_map, &sh, "'\\n'".to_string());
    let first = reader.next_token().tok;
    let expected = token::Literal(token::Char(token::intern("\\n")), None);
    assert_eq!(first, expected);
}
1794 | ||
9cc50fc6 SL |
/// The first token of `'abc` must be a `Lifetime` whose identifier
/// includes the leading quote.
#[test]
fn lifetime_name() {
    let code_map = Rc::new(CodeMap::new());
    let sh = mk_sh(code_map.clone());
    let mut reader = setup(&code_map, &sh, "'abc".to_string());
    let first = reader.next_token().tok;
    let expected = token::Lifetime(token::str_to_ident("'abc"));
    assert_eq!(first, expected);
}
1802 | ||
9cc50fc6 SL |
/// Lexes a raw string delimited by three `#`s whose body contains quotes,
/// a `#`, a backslash and a NUL byte. The resulting `StrRaw` literal must
/// carry the body unchanged, a hash count of 3, and no suffix.
#[test]
fn raw_string() {
    let code_map = Rc::new(CodeMap::new());
    let sh = mk_sh(code_map.clone());
    let source = "r###\"\"#a\\b\x00c\"\"###".to_string();
    let mut reader = setup(&code_map, &sh, source);
    let first = reader.next_token().tok;
    let expected = token::Literal(token::StrRaw(token::intern("\"#a\\b\x00c\""), 3), None);
    assert_eq!(first, expected);
}
1812 | ||
9cc50fc6 SL |
/// Checks that an identifier glued directly onto a literal is reported as
/// that literal's suffix, and that the same identifier separated by a
/// space is not.
#[test]
fn literal_suffixes() {
    let code_map = Rc::new(CodeMap::new());
    let sh = mk_sh(code_map.clone());

    // For literal source text `$src` of kind `token::$kind` with interned
    // contents `$contents`, lex it once as `<src>suffix` and once as
    // `<src> suffix`, comparing only the first token each time.
    macro_rules! check_literal {
        ($src: expr, $kind: ident, $contents: expr) => {{
            let glued = setup(&code_map, &sh, format!("{}suffix", $src)).next_token().tok;
            assert_eq!(glued,
                       token::Literal(token::$kind(token::intern($contents)),
                                      Some(token::intern("suffix"))));
            // with a whitespace separator:
            let spaced = setup(&code_map, &sh, format!("{} suffix", $src)).next_token().tok;
            assert_eq!(spaced,
                       token::Literal(token::$kind(token::intern($contents)),
                                      None));
        }}
    }

    check_literal!("'a'", Char, "a");
    check_literal!("b'a'", Byte, "a");
    check_literal!("\"a\"", Str_, "a");
    check_literal!("b\"a\"", ByteStr, "a");
    check_literal!("1234", Integer, "1234");
    check_literal!("0b101", Integer, "0b101");
    check_literal!("0xABC", Integer, "0xABC");
    check_literal!("1.0", Float, "1.0");
    check_literal!("1.0e10", Float, "1.0e10");

    // An integer immediately followed by `us`.
    let int_us = setup(&code_map, &sh, "2us".to_string()).next_token().tok;
    assert_eq!(int_us,
               token::Literal(token::Integer(token::intern("2")),
                              Some(token::intern("us"))));

    // Raw string and raw byte string, each with a glued suffix.
    let raw_str = setup(&code_map, &sh, "r###\"raw\"###suffix".to_string()).next_token().tok;
    assert_eq!(raw_str,
               token::Literal(token::StrRaw(token::intern("raw"), 3),
                              Some(token::intern("suffix"))));
    let raw_bytes = setup(&code_map, &sh, "br###\"raw\"###suffix".to_string()).next_token().tok;
    assert_eq!(raw_bytes,
               token::Literal(token::ByteStrRaw(token::intern("raw"), 3),
                              Some(token::intern("suffix"))));
}
1849 | ||
9cc50fc6 SL |
/// `is_doc_comment` must accept a bare `///` and `///` followed by text,
/// and must reject four slashes.
#[test]
fn line_doc_comments() {
    let cases = [("///", true), ("/// blah", true), ("////", false)];
    for &(text, expected) in &cases {
        assert!(is_doc_comment(text) == expected,
                "is_doc_comment({:?}) should be {}", text, expected);
    }
}
1856 | ||
9cc50fc6 SL |
/// A block comment containing another block comment must lex as a single
/// `Comment` token; the char literal right after it must then lex
/// normally as `Char("a")` with no suffix.
#[test]
fn nested_block_comments() {
    let cm = Rc::new(CodeMap::new());
    let sh = mk_sh(cm.clone());
    let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
    // Compare directly so that a failure reports the token that was actually
    // produced instead of a fixed panic message.
    assert_eq!(lexer.next_token().tok, token::Comment);
    assert_eq!(lexer.next_token().tok,
               token::Literal(token::Char(token::intern("a")), None));
}
1869 | ||
9cc50fc6 SL |
/// Lexes a plain line comment and a doc comment, each terminated by
/// `\r\n`, and checks the first three tokens.
#[test]
fn crlf_comments() {
    let code_map = Rc::new(CodeMap::new());
    let sh = mk_sh(code_map.clone());
    let source = "// test\r\n/// test\r\n".to_string();
    let mut reader = setup(&code_map, &sh, source);

    // `// test` is 7 bytes long, so the expected span 0..7 excludes the
    // trailing `\r\n`.
    let first = reader.next_token();
    assert_eq!(first.tok, token::Comment);
    assert_eq!(first.sp, ::codemap::mk_sp(BytePos(0), BytePos(7)));

    // The line ending comes out as a separate whitespace token.
    let second = reader.next_token().tok;
    assert_eq!(second, token::Whitespace);

    // The doc comment's interned text must not include the `\r\n` either.
    let third = reader.next_token().tok;
    assert_eq!(third, token::DocComment(token::intern("/// test")));
}
1a4d82fc | 1882 | } |