]>
git.proxmox.com Git - rustc.git/blob - src/llvm/lib/MC/MCParser/AsmLexer.cpp
1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This class implements the lexer for assembly files.
12 //===----------------------------------------------------------------------===//
14 #include "llvm/MC/MCParser/AsmLexer.h"
15 #include "llvm/MC/MCAsmInfo.h"
16 #include "llvm/Support/MemoryBuffer.h"
17 #include "llvm/Support/SMLoc.h"
24 AsmLexer::AsmLexer(const MCAsmInfo
&_MAI
) : MAI(_MAI
) {
26 isAtStartOfLine
= true;
27 AllowAtInIdentifier
= !StringRef(MAI
.getCommentString()).startswith("@");
30 AsmLexer::~AsmLexer() {
33 void AsmLexer::setBuffer(StringRef Buf
, const char *ptr
) {
39 CurPtr
= CurBuf
.begin();
44 /// ReturnError - Set the error to the specified string at the specified
45 /// location. This is defined to always return AsmToken::Error.
46 AsmToken
AsmLexer::ReturnError(const char *Loc
, const std::string
&Msg
) {
47 SetError(SMLoc::getFromPointer(Loc
), Msg
);
49 return AsmToken(AsmToken::Error
, StringRef(Loc
, 0));
52 int AsmLexer::getNextChar() {
53 char CurChar
= *CurPtr
++;
56 return (unsigned char)CurChar
;
58 // A nul character in the stream is either the end of the current buffer or
59 // a random nul in the file. Disambiguate that here.
60 if (CurPtr
- 1 != CurBuf
.end())
61 return 0; // Just whitespace.
63 // Otherwise, return end of file.
64 --CurPtr
; // Another call to lex will return EOF again.
69 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
71 /// The leading integral digit sequence and dot should have already been
72 /// consumed, some or all of the fractional digit sequence *can* have been
74 AsmToken
AsmLexer::LexFloatLiteral() {
75 // Skip the fractional digit sequence.
76 while (isdigit(*CurPtr
))
79 // Check for exponent; we intentionally accept a slightly wider set of
80 // literals here and rely on the upstream client to reject invalid ones (e.g.,
82 if (*CurPtr
== 'e' || *CurPtr
== 'E') {
84 if (*CurPtr
== '-' || *CurPtr
== '+')
86 while (isdigit(*CurPtr
))
90 return AsmToken(AsmToken::Real
,
91 StringRef(TokStart
, CurPtr
- TokStart
));
94 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9]+
95 /// while making sure there are enough actual digits around for the constant to
98 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
99 /// before we get here.
100 AsmToken
AsmLexer::LexHexFloatLiteral(bool NoIntDigits
) {
101 assert((*CurPtr
== 'p' || *CurPtr
== 'P' || *CurPtr
== '.') &&
102 "unexpected parse state in floating hex");
103 bool NoFracDigits
= true;
105 // Skip the fractional part if there is one
106 if (*CurPtr
== '.') {
109 const char *FracStart
= CurPtr
;
110 while (isxdigit(*CurPtr
))
113 NoFracDigits
= CurPtr
== FracStart
;
116 if (NoIntDigits
&& NoFracDigits
)
117 return ReturnError(TokStart
, "invalid hexadecimal floating-point constant: "
118 "expected at least one significand digit");
120 // Make sure we do have some kind of proper exponent part
121 if (*CurPtr
!= 'p' && *CurPtr
!= 'P')
122 return ReturnError(TokStart
, "invalid hexadecimal floating-point constant: "
123 "expected exponent part 'p'");
126 if (*CurPtr
== '+' || *CurPtr
== '-')
129 // N.b. exponent digits are *not* hex
130 const char *ExpStart
= CurPtr
;
131 while (isdigit(*CurPtr
))
134 if (CurPtr
== ExpStart
)
135 return ReturnError(TokStart
, "invalid hexadecimal floating-point constant: "
136 "expected at least one exponent digit");
138 return AsmToken(AsmToken::Real
, StringRef(TokStart
, CurPtr
- TokStart
));
141 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
142 static bool IsIdentifierChar(char c
, bool AllowAt
) {
143 return isalnum(c
) || c
== '_' || c
== '$' || c
== '.' ||
144 (c
== '@' && AllowAt
) || c
== '?';
146 AsmToken
AsmLexer::LexIdentifier() {
147 // Check for floating point literals.
148 if (CurPtr
[-1] == '.' && isdigit(*CurPtr
)) {
149 // Disambiguate a .1243foo identifier from a floating literal.
150 while (isdigit(*CurPtr
))
152 if (*CurPtr
== 'e' || *CurPtr
== 'E' ||
153 !IsIdentifierChar(*CurPtr
, AllowAtInIdentifier
))
154 return LexFloatLiteral();
157 while (IsIdentifierChar(*CurPtr
, AllowAtInIdentifier
))
160 // Handle . as a special case.
161 if (CurPtr
== TokStart
+1 && TokStart
[0] == '.')
162 return AsmToken(AsmToken::Dot
, StringRef(TokStart
, 1));
164 return AsmToken(AsmToken::Identifier
, StringRef(TokStart
, CurPtr
- TokStart
));
167 /// LexSlash: Slash: /
168 /// C-Style Comment: /* ... */
169 AsmToken
AsmLexer::LexSlash() {
171 case '*': break; // C style comment.
172 case '/': return ++CurPtr
, LexLineComment();
173 default: return AsmToken(AsmToken::Slash
, StringRef(CurPtr
-1, 1));
177 ++CurPtr
; // skip the star.
179 int CurChar
= getNextChar();
182 return ReturnError(TokStart
, "unterminated comment");
184 // End of the comment?
185 if (CurPtr
[0] != '/') break;
187 ++CurPtr
; // End the */.
193 /// LexLineComment: Comment: #[^\n]*
195 AsmToken
AsmLexer::LexLineComment() {
196 // FIXME: This is broken if we happen to have a comment at the end of a file, which
197 // was .included, and which doesn't end with a newline.
198 int CurChar
= getNextChar();
199 while (CurChar
!= '\n' && CurChar
!= '\r' && CurChar
!= EOF
)
200 CurChar
= getNextChar();
203 return AsmToken(AsmToken::Eof
, StringRef(TokStart
, 0));
204 return AsmToken(AsmToken::EndOfStatement
, StringRef(TokStart
, 0));
207 static void SkipIgnoredIntegerSuffix(const char *&CurPtr
) {
208 // Skip ULL, UL, U, L and LL suffixes.
209 if (CurPtr
[0] == 'U')
211 if (CurPtr
[0] == 'L')
213 if (CurPtr
[0] == 'L')
217 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
218 // integer as a hexadecimal, possibly with leading zeroes.
219 static unsigned doLookAhead(const char *&CurPtr
, unsigned DefaultRadix
) {
220 const char *FirstHex
= nullptr;
221 const char *LookAhead
= CurPtr
;
223 if (isdigit(*LookAhead
)) {
225 } else if (isxdigit(*LookAhead
)) {
227 FirstHex
= LookAhead
;
233 bool isHex
= *LookAhead
== 'h' || *LookAhead
== 'H';
234 CurPtr
= isHex
|| !FirstHex
? LookAhead
: FirstHex
;
240 static AsmToken
intToken(StringRef Ref
, APInt
&Value
)
242 if (Value
.isIntN(64))
243 return AsmToken(AsmToken::Integer
, Ref
, Value
);
244 return AsmToken(AsmToken::BigNum
, Ref
, Value
);
247 /// LexDigit: First character is [0-9].
248 /// Local Label: [0-9][:]
249 /// Forward/Backward Label: [0-9][fb]
250 /// Binary integer: 0b[01]+
251 /// Octal integer: 0[0-7]+
252 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
253 /// Decimal integer: [1-9][0-9]*
254 AsmToken
AsmLexer::LexDigit() {
255 // Decimal integer: [1-9][0-9]*
256 if (CurPtr
[-1] != '0' || CurPtr
[0] == '.') {
257 unsigned Radix
= doLookAhead(CurPtr
, 10);
258 bool isHex
= Radix
== 16;
259 // Check for floating point literals.
260 if (!isHex
&& (*CurPtr
== '.' || *CurPtr
== 'e')) {
262 return LexFloatLiteral();
265 StringRef
Result(TokStart
, CurPtr
- TokStart
);
267 APInt
Value(128, 0, true);
268 if (Result
.getAsInteger(Radix
, Value
))
269 return ReturnError(TokStart
, !isHex
? "invalid decimal number" :
270 "invalid hexdecimal number");
272 // Consume the [bB][hH].
273 if (Radix
== 2 || Radix
== 16)
276 // The darwin/x86 (and x86-64) assembler accepts and ignores type
277 // suffixes on integer literals.
278 SkipIgnoredIntegerSuffix(CurPtr
);
280 return intToken(Result
, Value
);
283 if (*CurPtr
== 'b') {
285 // See if we actually have "0b" as part of something like "jmp 0b\n"
286 if (!isdigit(CurPtr
[0])) {
288 StringRef
Result(TokStart
, CurPtr
- TokStart
);
289 return AsmToken(AsmToken::Integer
, Result
, 0);
291 const char *NumStart
= CurPtr
;
292 while (CurPtr
[0] == '0' || CurPtr
[0] == '1')
295 // Requires at least one binary digit.
296 if (CurPtr
== NumStart
)
297 return ReturnError(TokStart
, "invalid binary number");
299 StringRef
Result(TokStart
, CurPtr
- TokStart
);
301 APInt
Value(128, 0, true);
302 if (Result
.substr(2).getAsInteger(2, Value
))
303 return ReturnError(TokStart
, "invalid binary number");
305 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
306 // suffixes on integer literals.
307 SkipIgnoredIntegerSuffix(CurPtr
);
309 return intToken(Result
, Value
);
312 if (*CurPtr
== 'x') {
314 const char *NumStart
= CurPtr
;
315 while (isxdigit(CurPtr
[0]))
318 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
319 // diagnosed by LexHexFloatLiteral).
320 if (CurPtr
[0] == '.' || CurPtr
[0] == 'p' || CurPtr
[0] == 'P')
321 return LexHexFloatLiteral(NumStart
== CurPtr
);
323 // Otherwise requires at least one hex digit.
324 if (CurPtr
== NumStart
)
325 return ReturnError(CurPtr
-2, "invalid hexadecimal number");
327 APInt
Result(128, 0);
328 if (StringRef(TokStart
, CurPtr
- TokStart
).getAsInteger(0, Result
))
329 return ReturnError(TokStart
, "invalid hexadecimal number");
331 // Consume the optional [hH].
332 if (*CurPtr
== 'h' || *CurPtr
== 'H')
335 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
336 // suffixes on integer literals.
337 SkipIgnoredIntegerSuffix(CurPtr
);
339 return intToken(StringRef(TokStart
, CurPtr
- TokStart
), Result
);
342 // Either octal or hexadecimal.
343 APInt
Value(128, 0, true);
344 unsigned Radix
= doLookAhead(CurPtr
, 8);
345 bool isHex
= Radix
== 16;
346 StringRef
Result(TokStart
, CurPtr
- TokStart
);
347 if (Result
.getAsInteger(Radix
, Value
))
348 return ReturnError(TokStart
, !isHex
? "invalid octal number" :
349 "invalid hexdecimal number");
355 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
356 // suffixes on integer literals.
357 SkipIgnoredIntegerSuffix(CurPtr
);
359 return intToken(Result
, Value
);
362 /// LexSingleQuote: Integer: 'b'
363 AsmToken
AsmLexer::LexSingleQuote() {
364 int CurChar
= getNextChar();
367 CurChar
= getNextChar();
370 return ReturnError(TokStart
, "unterminated single quote");
372 CurChar
= getNextChar();
375 return ReturnError(TokStart
, "single quote way too long");
377 // The idea here being that 'c' is basically just an integral
379 StringRef Res
= StringRef(TokStart
,CurPtr
- TokStart
);
382 if (Res
.startswith("\'\\")) {
383 char theChar
= Res
[2];
385 default: Value
= theChar
; break;
386 case '\'': Value
= '\''; break;
387 case 't': Value
= '\t'; break;
388 case 'n': Value
= '\n'; break;
389 case 'b': Value
= '\b'; break;
394 return AsmToken(AsmToken::Integer
, Res
, Value
);
398 /// LexQuote: String: "..."
399 AsmToken
AsmLexer::LexQuote() {
400 int CurChar
= getNextChar();
401 // TODO: does gas allow multiline string constants?
402 while (CurChar
!= '"') {
403 if (CurChar
== '\\') {
405 CurChar
= getNextChar();
409 return ReturnError(TokStart
, "unterminated string constant");
411 CurChar
= getNextChar();
414 return AsmToken(AsmToken::String
, StringRef(TokStart
, CurPtr
- TokStart
));
417 StringRef
AsmLexer::LexUntilEndOfStatement() {
420 while (!isAtStartOfComment(CurPtr
) && // Start of line comment.
421 !isAtStatementSeparator(CurPtr
) && // End of statement marker.
422 *CurPtr
!= '\n' && *CurPtr
!= '\r' &&
423 (*CurPtr
!= 0 || CurPtr
!= CurBuf
.end())) {
426 return StringRef(TokStart
, CurPtr
-TokStart
);
429 StringRef
AsmLexer::LexUntilEndOfLine() {
432 while (*CurPtr
!= '\n' && *CurPtr
!= '\r' &&
433 (*CurPtr
!= 0 || CurPtr
!= CurBuf
.end())) {
436 return StringRef(TokStart
, CurPtr
-TokStart
);
439 const AsmToken
AsmLexer::peekTok(bool ShouldSkipSpace
) {
440 const char *SavedTokStart
= TokStart
;
441 const char *SavedCurPtr
= CurPtr
;
442 bool SavedAtStartOfLine
= isAtStartOfLine
;
443 bool SavedSkipSpace
= SkipSpace
;
445 std::string SavedErr
= getErr();
446 SMLoc SavedErrLoc
= getErrLoc();
448 SkipSpace
= ShouldSkipSpace
;
449 AsmToken Token
= LexToken();
451 SetError(SavedErrLoc
, SavedErr
);
453 SkipSpace
= SavedSkipSpace
;
454 isAtStartOfLine
= SavedAtStartOfLine
;
455 CurPtr
= SavedCurPtr
;
456 TokStart
= SavedTokStart
;
461 bool AsmLexer::isAtStartOfComment(const char *Ptr
) {
462 const char *CommentString
= MAI
.getCommentString();
464 if (CommentString
[1] == '\0')
465 return CommentString
[0] == Ptr
[0];
467 // FIXME: special case for the bogus "##" comment string in X86MCAsmInfoDarwin
468 if (CommentString
[1] == '#')
469 return CommentString
[0] == Ptr
[0];
471 return strncmp(Ptr
, CommentString
, strlen(CommentString
)) == 0;
474 bool AsmLexer::isAtStatementSeparator(const char *Ptr
) {
475 return strncmp(Ptr
, MAI
.getSeparatorString(),
476 strlen(MAI
.getSeparatorString())) == 0;
479 AsmToken
AsmLexer::LexToken() {
481 // This always consumes at least one character.
482 int CurChar
= getNextChar();
484 if (isAtStartOfComment(TokStart
)) {
485 // If this comment starts with a '#', then return the Hash token and let
486 // the assembler parser see if it can be parsed as a cpp line filename
487 // comment. We do this only if we are at the start of a line.
488 if (CurChar
== '#' && isAtStartOfLine
)
489 return AsmToken(AsmToken::Hash
, StringRef(TokStart
, 1));
490 isAtStartOfLine
= true;
491 return LexLineComment();
493 if (isAtStatementSeparator(TokStart
)) {
494 CurPtr
+= strlen(MAI
.getSeparatorString()) - 1;
495 return AsmToken(AsmToken::EndOfStatement
,
496 StringRef(TokStart
, strlen(MAI
.getSeparatorString())));
499 // If we're missing a newline at EOF, make sure we still get an
500 // EndOfStatement token before the Eof token.
501 if (CurChar
== EOF
&& !isAtStartOfLine
) {
502 isAtStartOfLine
= true;
503 return AsmToken(AsmToken::EndOfStatement
, StringRef(TokStart
, 1));
506 isAtStartOfLine
= false;
509 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
510 if (isalpha(CurChar
) || CurChar
== '_' || CurChar
== '.')
511 return LexIdentifier();
513 // Unknown character, emit an error.
514 return ReturnError(TokStart
, "invalid character in input");
515 case EOF
: return AsmToken(AsmToken::Eof
, StringRef(TokStart
, 0));
520 // Ignore whitespace.
524 while (*CurPtr
==' ' || *CurPtr
=='\t') {
528 return AsmToken(AsmToken::Space
, StringRef(TokStart
, len
));
530 case '\n': // FALL THROUGH.
532 isAtStartOfLine
= true;
533 return AsmToken(AsmToken::EndOfStatement
, StringRef(TokStart
, 1));
534 case ':': return AsmToken(AsmToken::Colon
, StringRef(TokStart
, 1));
535 case '+': return AsmToken(AsmToken::Plus
, StringRef(TokStart
, 1));
536 case '-': return AsmToken(AsmToken::Minus
, StringRef(TokStart
, 1));
537 case '~': return AsmToken(AsmToken::Tilde
, StringRef(TokStart
, 1));
538 case '(': return AsmToken(AsmToken::LParen
, StringRef(TokStart
, 1));
539 case ')': return AsmToken(AsmToken::RParen
, StringRef(TokStart
, 1));
540 case '[': return AsmToken(AsmToken::LBrac
, StringRef(TokStart
, 1));
541 case ']': return AsmToken(AsmToken::RBrac
, StringRef(TokStart
, 1));
542 case '{': return AsmToken(AsmToken::LCurly
, StringRef(TokStart
, 1));
543 case '}': return AsmToken(AsmToken::RCurly
, StringRef(TokStart
, 1));
544 case '*': return AsmToken(AsmToken::Star
, StringRef(TokStart
, 1));
545 case ',': return AsmToken(AsmToken::Comma
, StringRef(TokStart
, 1));
546 case '$': return AsmToken(AsmToken::Dollar
, StringRef(TokStart
, 1));
547 case '@': return AsmToken(AsmToken::At
, StringRef(TokStart
, 1));
548 case '\\': return AsmToken(AsmToken::BackSlash
, StringRef(TokStart
, 1));
551 return ++CurPtr
, AsmToken(AsmToken::EqualEqual
, StringRef(TokStart
, 2));
552 return AsmToken(AsmToken::Equal
, StringRef(TokStart
, 1));
555 return ++CurPtr
, AsmToken(AsmToken::PipePipe
, StringRef(TokStart
, 2));
556 return AsmToken(AsmToken::Pipe
, StringRef(TokStart
, 1));
557 case '^': return AsmToken(AsmToken::Caret
, StringRef(TokStart
, 1));
560 return ++CurPtr
, AsmToken(AsmToken::AmpAmp
, StringRef(TokStart
, 2));
561 return AsmToken(AsmToken::Amp
, StringRef(TokStart
, 1));
564 return ++CurPtr
, AsmToken(AsmToken::ExclaimEqual
, StringRef(TokStart
, 2));
565 return AsmToken(AsmToken::Exclaim
, StringRef(TokStart
, 1));
566 case '%': return AsmToken(AsmToken::Percent
, StringRef(TokStart
, 1));
567 case '/': return LexSlash();
568 case '#': return AsmToken(AsmToken::Hash
, StringRef(TokStart
, 1));
569 case '\'': return LexSingleQuote();
570 case '"': return LexQuote();
571 case '0': case '1': case '2': case '3': case '4':
572 case '5': case '6': case '7': case '8': case '9':
576 case '<': return ++CurPtr
, AsmToken(AsmToken::LessLess
,
577 StringRef(TokStart
, 2));
578 case '=': return ++CurPtr
, AsmToken(AsmToken::LessEqual
,
579 StringRef(TokStart
, 2));
580 case '>': return ++CurPtr
, AsmToken(AsmToken::LessGreater
,
581 StringRef(TokStart
, 2));
582 default: return AsmToken(AsmToken::Less
, StringRef(TokStart
, 1));
586 case '>': return ++CurPtr
, AsmToken(AsmToken::GreaterGreater
,
587 StringRef(TokStart
, 2));
588 case '=': return ++CurPtr
, AsmToken(AsmToken::GreaterEqual
,
589 StringRef(TokStart
, 2));
590 default: return AsmToken(AsmToken::Greater
, StringRef(TokStart
, 1));
593 // TODO: Quoted identifiers (objc methods etc)
594 // local labels: [0-9][:]
595 // Forward/backward labels: [0-9][fb]
596 // Integers, fp constants, character constants.