1 //! Lexing `&str` into a sequence of Rust tokens.
3 //! Note that strictly speaking the parser in this crate is not required to work
4 //! on tokens which originated from text. Macros, e.g., can synthesize tokens out
5 //! of thin air. So, ideally, lexer should be an orthogonal crate. It is however
6 //! convenient to include a text-based lexer here!
8 //! Note that these tokens, unlike the tokens we feed into the parser, do
9 //! include info about comments and whitespace.
14 SyntaxKind
::{self, *}
,
18 pub struct LexedStr
<'a
> {
20 kind
: Vec
<SyntaxKind
>,
30 impl<'a
> LexedStr
<'a
> {
31 pub fn new(text
: &'a
str) -> LexedStr
<'a
> {
32 let mut conv
= Converter
::new(text
);
33 if let Some(shebang_len
) = rustc_lexer
::strip_shebang(text
) {
34 conv
.res
.push(SHEBANG
, conv
.offset
);
35 conv
.offset
= shebang_len
;
38 for token
in rustc_lexer
::tokenize(&text
[conv
.offset
..]) {
39 let token_text
= &text
[conv
.offset
..][..token
.len
as usize];
41 conv
.extend_token(&token
.kind
, token_text
);
44 conv
.finalize_with_eof()
47 pub fn single_token(text
: &'a
str) -> Option
<(SyntaxKind
, Option
<String
>)> {
52 let token
= rustc_lexer
::tokenize(text
).next()?
;
53 if token
.len
as usize != text
.len() {
57 let mut conv
= Converter
::new(text
);
58 conv
.extend_token(&token
.kind
, text
);
59 match &*conv
.res
.kind
{
60 [kind
] => Some((*kind
, conv
.res
.error
.pop().map(|it
| it
.msg
))),
65 pub fn as_str(&self) -> &str {
69 pub fn len(&self) -> usize {
73 pub fn is_empty(&self) -> bool
{
77 pub fn kind(&self, i
: usize) -> SyntaxKind
{
78 assert
!(i
< self.len());
82 pub fn text(&self, i
: usize) -> &str {
83 self.range_text(i
..i
+ 1)
86 pub fn range_text(&self, r
: ops
::Range
<usize>) -> &str {
87 assert
!(r
.start
< r
.end
&& r
.end
<= self.len());
88 let lo
= self.start
[r
.start
] as usize;
89 let hi
= self.start
[r
.end
] as usize;
94 pub fn text_range(&self, i
: usize) -> ops
::Range
<usize> {
95 assert
!(i
< self.len());
96 let lo
= self.start
[i
] as usize;
97 let hi
= self.start
[i
+ 1] as usize;
100 pub fn text_start(&self, i
: usize) -> usize {
101 assert
!(i
<= self.len());
102 self.start
[i
] as usize
104 pub fn text_len(&self, i
: usize) -> usize {
105 assert
!(i
< self.len());
106 let r
= self.text_range(i
);
110 pub fn error(&self, i
: usize) -> Option
<&str> {
111 assert
!(i
< self.len());
112 let err
= self.error
.binary_search_by_key(&(i
as u32), |i
| i
.token
).ok()?
;
113 Some(self.error
[err
].msg
.as_str())
116 pub fn errors(&self) -> impl Iterator
<Item
= (usize, &str)> + '_
{
117 self.error
.iter().map(|it
| (it
.token
as usize, it
.msg
.as_str()))
120 fn push(&mut self, kind
: SyntaxKind
, offset
: usize) {
121 self.kind
.push(kind
);
122 self.start
.push(offset
as u32);
126 struct Converter
<'a
> {
131 impl<'a
> Converter
<'a
> {
132 fn new(text
: &'a
str) -> Self {
134 res
: LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() }
,
139 fn finalize_with_eof(mut self) -> LexedStr
<'a
> {
140 self.res
.push(EOF
, self.offset
);
144 fn push(&mut self, kind
: SyntaxKind
, len
: usize, err
: Option
<&str>) {
145 self.res
.push(kind
, self.offset
);
148 if let Some(err
) = err
{
149 let token
= self.res
.len() as u32;
150 let msg
= err
.to_string();
151 self.res
.error
.push(LexError { msg, token }
);
155 fn extend_token(&mut self, kind
: &rustc_lexer
::TokenKind
, token_text
: &str) {
156 // A note on an intended tradeoff:
157 // We drop some useful information here (see patterns with double dots `..`)
158 // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
159 // being `u16` that come from `rowan::SyntaxKind`.
164 rustc_lexer
::TokenKind
::LineComment { doc_style: _ }
=> COMMENT
,
165 rustc_lexer
::TokenKind
::BlockComment { doc_style: _, terminated }
=> {
167 err
= "Missing trailing `*/` symbols to terminate the block comment";
172 rustc_lexer
::TokenKind
::Whitespace
=> WHITESPACE
,
174 rustc_lexer
::TokenKind
::Ident
if token_text
== "_" => UNDERSCORE
,
175 rustc_lexer
::TokenKind
::Ident
=> {
176 SyntaxKind
::from_keyword(token_text
).unwrap_or(IDENT
)
178 rustc_lexer
::TokenKind
::InvalidIdent
=> {
179 err
= "Ident contains invalid characters";
183 rustc_lexer
::TokenKind
::RawIdent
=> IDENT
,
184 rustc_lexer
::TokenKind
::Literal { kind, .. }
=> {
185 self.extend_literal(token_text
.len(), kind
);
189 rustc_lexer
::TokenKind
::Lifetime { starts_with_number }
=> {
190 if *starts_with_number
{
191 err
= "Lifetime name cannot start with a number";
196 rustc_lexer
::TokenKind
::Semi
=> T
![;],
197 rustc_lexer
::TokenKind
::Comma
=> T
![,],
198 rustc_lexer
::TokenKind
::Dot
=> T
![.],
199 rustc_lexer
::TokenKind
::OpenParen
=> T
!['
('
],
200 rustc_lexer
::TokenKind
::CloseParen
=> T
!['
)'
],
201 rustc_lexer
::TokenKind
::OpenBrace
=> T
!['
{'
],
202 rustc_lexer
::TokenKind
::CloseBrace
=> T
!['
}'
],
203 rustc_lexer
::TokenKind
::OpenBracket
=> T
!['
['
],
204 rustc_lexer
::TokenKind
::CloseBracket
=> T
!['
]'
],
205 rustc_lexer
::TokenKind
::At
=> T
![@
],
206 rustc_lexer
::TokenKind
::Pound
=> T
![#],
207 rustc_lexer
::TokenKind
::Tilde
=> T
![~],
208 rustc_lexer
::TokenKind
::Question
=> T
![?
],
209 rustc_lexer
::TokenKind
::Colon
=> T
![:],
210 rustc_lexer
::TokenKind
::Dollar
=> T
![$
],
211 rustc_lexer
::TokenKind
::Eq
=> T
![=],
212 rustc_lexer
::TokenKind
::Bang
=> T
![!],
213 rustc_lexer
::TokenKind
::Lt
=> T
![<],
214 rustc_lexer
::TokenKind
::Gt
=> T
![>],
215 rustc_lexer
::TokenKind
::Minus
=> T
![-],
216 rustc_lexer
::TokenKind
::And
=> T
![&],
217 rustc_lexer
::TokenKind
::Or
=> T
![|],
218 rustc_lexer
::TokenKind
::Plus
=> T
![+],
219 rustc_lexer
::TokenKind
::Star
=> T
![*],
220 rustc_lexer
::TokenKind
::Slash
=> T
![/],
221 rustc_lexer
::TokenKind
::Caret
=> T
![^
],
222 rustc_lexer
::TokenKind
::Percent
=> T
![%],
223 rustc_lexer
::TokenKind
::Unknown
=> ERROR
,
224 rustc_lexer
::TokenKind
::UnknownPrefix
if token_text
== "builtin" => IDENT
,
225 rustc_lexer
::TokenKind
::UnknownPrefix
=> {
226 err
= "unknown literal prefix";
229 rustc_lexer
::TokenKind
::Eof
=> EOF
,
233 let err
= if err
.is_empty() { None }
else { Some(err) }
;
234 self.push(syntax_kind
, token_text
.len(), err
);
237 fn extend_literal(&mut self, len
: usize, kind
: &rustc_lexer
::LiteralKind
) {
240 let syntax_kind
= match *kind
{
241 rustc_lexer
::LiteralKind
::Int { empty_int, base: _ }
=> {
243 err
= "Missing digits after the integer base prefix";
247 rustc_lexer
::LiteralKind
::Float { empty_exponent, base: _ }
=> {
249 err
= "Missing digits after the exponent symbol";
253 rustc_lexer
::LiteralKind
::Char { terminated }
=> {
255 err
= "Missing trailing `'` symbol to terminate the character literal";
259 rustc_lexer
::LiteralKind
::Byte { terminated }
=> {
261 err
= "Missing trailing `'` symbol to terminate the byte literal";
265 rustc_lexer
::LiteralKind
::Str { terminated }
=> {
267 err
= "Missing trailing `\"` symbol to terminate the string literal";
271 rustc_lexer
::LiteralKind
::ByteStr { terminated }
=> {
273 err
= "Missing trailing `\"` symbol to terminate the byte string literal";
277 rustc_lexer
::LiteralKind
::CStr { terminated }
=> {
279 err
= "Missing trailing `\"` symbol to terminate the string literal";
283 rustc_lexer
::LiteralKind
::RawStr { n_hashes }
=> {
284 if n_hashes
.is_none() {
285 err
= "Invalid raw string literal";
289 rustc_lexer
::LiteralKind
::RawByteStr { n_hashes }
=> {
290 if n_hashes
.is_none() {
291 err
= "Invalid raw string literal";
295 rustc_lexer
::LiteralKind
::RawCStr { n_hashes }
=> {
296 if n_hashes
.is_none() {
297 err
= "Invalid raw string literal";
303 let err
= if err
.is_empty() { None }
else { Some(err) }
;
304 self.push(syntax_kind
, len
, err
);