//! Lexing `&str` into a sequence of Rust tokens.
//!
//! Note that, strictly speaking, the parser in this crate is not required to work
//! on tokens which originated from text. Macros, for example, can synthesize tokens
//! out of thin air. So, ideally, the lexer should be an orthogonal crate. It is,
//! however, convenient to include a text-based lexer here!
//!
//! Note that these tokens, unlike the tokens we feed into the parser, do
//! include info about comments and whitespace.
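//!
//! A minimal usage sketch (illustrative only; the printed kinds are what the lexer
//! is expected to produce, not verified output):
//!
//! ```ignore
//! let lexed = LexedStr::new("let x = 92;");
//! for i in 0..lexed.len() {
//!     // Prints pairs like `LET_KW "let"`, `WHITESPACE " "`, `IDENT "x"`, ...
//!     println!("{:?} {:?}", lexed.kind(i), lexed.text(i));
//! }
//! ```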

use std::ops;

use crate::{
    SyntaxKind::{self, *},
    T,
};

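/// The original text together with the lexed tokens, stored as parallel arrays:
/// `kind[i]` and `start[i]` describe token `i`. A trailing EOF entry is appended,
/// so `start[i + 1]` is always the end offset of token `i`. Lexer errors are kept
/// in `error`, sorted by token index.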
pub struct LexedStr<'a> {
    text: &'a str,
    kind: Vec<SyntaxKind>,
    start: Vec<u32>,
    error: Vec<LexError>,
}

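/// A lexer error message attached to the token at index `token`.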
struct LexError {
    msg: String,
    token: u32,
}

impl<'a> LexedStr<'a> {
    pub fn new(text: &'a str) -> LexedStr<'a> {
        let mut conv = Converter::new(text);
        if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
            conv.res.push(SHEBANG, conv.offset);
            conv.offset = shebang_len;
        };

        for token in rustc_lexer::tokenize(&text[conv.offset..]) {
            let token_text = &text[conv.offset..][..token.len as usize];

            conv.extend_token(&token.kind, token_text);
        }

        conv.finalize_with_eof()
    }

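    /// Lexes `text` as a single token, returning its kind and the lexer error, if any.
    /// Returns `None` for empty input or input that lexes into more than one token
    /// (e.g. `single_token("struct")` is expected to yield `Some((STRUCT_KW, None))`,
    /// while `single_token("1 1")` yields `None`).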
    pub fn single_token(text: &'a str) -> Option<(SyntaxKind, Option<String>)> {
        if text.is_empty() {
            return None;
        }

        let token = rustc_lexer::tokenize(text).next()?;
        if token.len as usize != text.len() {
            return None;
        }

        let mut conv = Converter::new(text);
        conv.extend_token(&token.kind, text);
        match &*conv.res.kind {
            [kind] => Some((*kind, conv.res.error.pop().map(|it| it.msg))),
            _ => None,
        }
    }

    pub fn as_str(&self) -> &str {
        self.text
    }

    pub fn len(&self) -> usize {
        self.kind.len() - 1
    }

    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    pub fn kind(&self, i: usize) -> SyntaxKind {
        assert!(i < self.len());
        self.kind[i]
    }

    pub fn text(&self, i: usize) -> &str {
        self.range_text(i..i + 1)
    }

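    /// Returns the source text spanned by the token range `r` (not to be confused with
    /// `text_range` below, which maps a single token index to its byte range).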
    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
        assert!(r.start < r.end && r.end <= self.len());
        let lo = self.start[r.start] as usize;
        let hi = self.start[r.end] as usize;
        &self.text[lo..hi]
    }

    // Naming is hard.
    pub fn text_range(&self, i: usize) -> ops::Range<usize> {
        assert!(i < self.len());
        let lo = self.start[i] as usize;
        let hi = self.start[i + 1] as usize;
        lo..hi
    }
    pub fn text_start(&self, i: usize) -> usize {
        assert!(i <= self.len());
        self.start[i] as usize
    }
    pub fn text_len(&self, i: usize) -> usize {
        assert!(i < self.len());
        let r = self.text_range(i);
        r.end - r.start
    }

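    /// Returns the error message attached to token `i`, if any.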
    pub fn error(&self, i: usize) -> Option<&str> {
        assert!(i < self.len());
        let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
        Some(self.error[err].msg.as_str())
    }

    pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
        self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
    }

    fn push(&mut self, kind: SyntaxKind, offset: usize) {
        self.kind.push(kind);
        self.start.push(offset as u32);
    }
}

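/// Walks the `rustc_lexer` tokens, translating them into `SyntaxKind`s and
/// accumulating them (plus any lexer errors) into a `LexedStr`, while tracking
/// the current byte offset into the original text.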
struct Converter<'a> {
    res: LexedStr<'a>,
    offset: usize,
}

impl<'a> Converter<'a> {
    fn new(text: &'a str) -> Self {
        Self {
            res: LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() },
            offset: 0,
        }
    }

    fn finalize_with_eof(mut self) -> LexedStr<'a> {
        self.res.push(EOF, self.offset);
        self.res
    }

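    /// Records a token of the given kind and byte length at the current offset,
    /// optionally attaching an error message to it.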
    fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) {
        self.res.push(kind, self.offset);
        self.offset += len;

        if let Some(err) = err {
            let token = self.res.len() as u32;
            let msg = err.to_string();
            self.res.error.push(LexError { msg, token });
        }
    }

    fn extend_token(&mut self, kind: &rustc_lexer::TokenKind, token_text: &str) {
        // A note on an intended tradeoff:
        // We drop some useful information here (see patterns with double dots `..`).
        // Storing that info in `SyntaxKind` is not possible due to its layout
        // requirement of being a `u16`, which comes from `rowan::SyntaxKind`.
        let mut err = "";

        let syntax_kind = {
            match kind {
                rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
                rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
                    if !terminated {
                        err = "Missing trailing `*/` symbols to terminate the block comment";
                    }
                    COMMENT
                }

                rustc_lexer::TokenKind::Whitespace => WHITESPACE,

                rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
                rustc_lexer::TokenKind::Ident => {
                    SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
                }
                rustc_lexer::TokenKind::InvalidIdent => {
                    err = "Ident contains invalid characters";
                    IDENT
                }

                rustc_lexer::TokenKind::RawIdent => IDENT,
                rustc_lexer::TokenKind::Literal { kind, .. } => {
                    self.extend_literal(token_text.len(), kind);
                    return;
                }

                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    if *starts_with_number {
                        err = "Lifetime name cannot start with a number";
                    }
                    LIFETIME_IDENT
                }

                rustc_lexer::TokenKind::Semi => T![;],
                rustc_lexer::TokenKind::Comma => T![,],
                rustc_lexer::TokenKind::Dot => T![.],
                rustc_lexer::TokenKind::OpenParen => T!['('],
                rustc_lexer::TokenKind::CloseParen => T![')'],
                rustc_lexer::TokenKind::OpenBrace => T!['{'],
                rustc_lexer::TokenKind::CloseBrace => T!['}'],
                rustc_lexer::TokenKind::OpenBracket => T!['['],
                rustc_lexer::TokenKind::CloseBracket => T![']'],
                rustc_lexer::TokenKind::At => T![@],
                rustc_lexer::TokenKind::Pound => T![#],
                rustc_lexer::TokenKind::Tilde => T![~],
                rustc_lexer::TokenKind::Question => T![?],
                rustc_lexer::TokenKind::Colon => T![:],
                rustc_lexer::TokenKind::Dollar => T![$],
                rustc_lexer::TokenKind::Eq => T![=],
                rustc_lexer::TokenKind::Bang => T![!],
                rustc_lexer::TokenKind::Lt => T![<],
                rustc_lexer::TokenKind::Gt => T![>],
                rustc_lexer::TokenKind::Minus => T![-],
                rustc_lexer::TokenKind::And => T![&],
                rustc_lexer::TokenKind::Or => T![|],
                rustc_lexer::TokenKind::Plus => T![+],
                rustc_lexer::TokenKind::Star => T![*],
                rustc_lexer::TokenKind::Slash => T![/],
                rustc_lexer::TokenKind::Caret => T![^],
                rustc_lexer::TokenKind::Percent => T![%],
                rustc_lexer::TokenKind::Unknown => ERROR,
                rustc_lexer::TokenKind::UnknownPrefix if token_text == "builtin" => IDENT,
                rustc_lexer::TokenKind::UnknownPrefix => {
                    err = "unknown literal prefix";
                    IDENT
                }
                rustc_lexer::TokenKind::Eof => EOF,
            }
        };

        let err = if err.is_empty() { None } else { Some(err) };
        self.push(syntax_kind, token_text.len(), err);
    }

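    /// Maps a `rustc_lexer` literal into the corresponding `SyntaxKind` and pushes it,
    /// attaching an error for unterminated or otherwise malformed literals.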
    fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
        let mut err = "";

        let syntax_kind = match *kind {
            rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
                if empty_int {
                    err = "Missing digits after the integer base prefix";
                }
                INT_NUMBER
            }
            rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
                if empty_exponent {
                    err = "Missing digits after the exponent symbol";
                }
                FLOAT_NUMBER
            }
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    err = "Missing trailing `'` symbol to terminate the character literal";
                }
                CHAR
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    err = "Missing trailing `'` symbol to terminate the byte literal";
                }
                BYTE
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    err = "Missing trailing `\"` symbol to terminate the string literal";
                }
                STRING
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    err = "Missing trailing `\"` symbol to terminate the byte string literal";
                }
                BYTE_STRING
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    err = "Missing trailing `\"` symbol to terminate the C string literal";
                }
                C_STRING
            }
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                if n_hashes.is_none() {
                    err = "Invalid raw string literal";
                }
                STRING
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if n_hashes.is_none() {
                    err = "Invalid raw string literal";
                }
                BYTE_STRING
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if n_hashes.is_none() {
                    err = "Invalid raw string literal";
                }
                C_STRING
            }
        };

        let err = if err.is_empty() { None } else { Some(err) };
        self.push(syntax_kind, len, err);
    }
}