]>
Commit | Line | Data |
---|---|---|
223e47cc LB |
1 | //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// |
2 | // | |
3 | // The LLVM Compiler Infrastructure | |
4 | // | |
5 | // This file is distributed under the University of Illinois Open Source | |
6 | // License. See LICENSE.TXT for details. | |
7 | // | |
8 | //===----------------------------------------------------------------------===// | |
9 | // | |
10 | // Implement the Lexer for TableGen. | |
11 | // | |
12 | //===----------------------------------------------------------------------===// | |
13 | ||
14 | #include "TGLexer.h" | |
223e47cc LB |
15 | #include "llvm/ADT/StringSwitch.h" |
16 | #include "llvm/ADT/Twine.h" | |
970d7e83 LB |
17 | #include "llvm/Config/config.h" // for strtoull()/strtoll() define |
18 | #include "llvm/Support/MemoryBuffer.h" | |
19 | #include "llvm/Support/SourceMgr.h" | |
20 | #include "llvm/TableGen/Error.h" | |
223e47cc | 21 | #include <cctype> |
970d7e83 | 22 | #include <cerrno> |
223e47cc LB |
23 | #include <cstdio> |
24 | #include <cstdlib> | |
25 | #include <cstring> | |
223e47cc LB |
26 | |
27 | using namespace llvm; | |
28 | ||
29 | TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { | |
1a4d82fc JJ |
30 | CurBuffer = SrcMgr.getMainFileID(); |
31 | CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); | |
32 | CurPtr = CurBuf.begin(); | |
33 | TokStart = nullptr; | |
223e47cc LB |
34 | } |
35 | ||
36 | SMLoc TGLexer::getLoc() const { | |
37 | return SMLoc::getFromPointer(TokStart); | |
38 | } | |
39 | ||
40 | /// ReturnError - Set the error to the specified string at the specified | |
41 | /// location. This is defined to always return tgtok::Error. | |
42 | tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { | |
43 | PrintError(Loc, Msg); | |
44 | return tgtok::Error; | |
45 | } | |
46 | ||
47 | int TGLexer::getNextChar() { | |
48 | char CurChar = *CurPtr++; | |
49 | switch (CurChar) { | |
50 | default: | |
51 | return (unsigned char)CurChar; | |
52 | case 0: { | |
53 | // A nul character in the stream is either the end of the current buffer or | |
54 | // a random nul in the file. Disambiguate that here. | |
1a4d82fc | 55 | if (CurPtr-1 != CurBuf.end()) |
223e47cc LB |
56 | return 0; // Just whitespace. |
57 | ||
58 | // If this is the end of an included file, pop the parent file off the | |
59 | // include stack. | |
60 | SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); | |
61 | if (ParentIncludeLoc != SMLoc()) { | |
62 | CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); | |
1a4d82fc | 63 | CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); |
223e47cc LB |
64 | CurPtr = ParentIncludeLoc.getPointer(); |
65 | return getNextChar(); | |
66 | } | |
67 | ||
68 | // Otherwise, return end of file. | |
69 | --CurPtr; // Another call to lex will return EOF again. | |
70 | return EOF; | |
71 | } | |
72 | case '\n': | |
73 | case '\r': | |
74 | // Handle the newline character by ignoring it and incrementing the line | |
75 | // count. However, be careful about 'dos style' files with \n\r in them. | |
76 | // Only treat a \n\r or \r\n as a single line. | |
77 | if ((*CurPtr == '\n' || (*CurPtr == '\r')) && | |
78 | *CurPtr != CurChar) | |
79 | ++CurPtr; // Eat the two char newline sequence. | |
80 | return '\n'; | |
81 | } | |
82 | } | |
83 | ||
84 | int TGLexer::peekNextChar(int Index) { | |
85 | return *(CurPtr + Index); | |
86 | } | |
87 | ||
88 | tgtok::TokKind TGLexer::LexToken() { | |
89 | TokStart = CurPtr; | |
90 | // This always consumes at least one character. | |
91 | int CurChar = getNextChar(); | |
92 | ||
93 | switch (CurChar) { | |
94 | default: | |
95 | // Handle letters: [a-zA-Z_] | |
96 | if (isalpha(CurChar) || CurChar == '_') | |
97 | return LexIdentifier(); | |
98 | ||
99 | // Unknown character, emit an error. | |
100 | return ReturnError(TokStart, "Unexpected character"); | |
101 | case EOF: return tgtok::Eof; | |
102 | case ':': return tgtok::colon; | |
103 | case ';': return tgtok::semi; | |
104 | case '.': return tgtok::period; | |
105 | case ',': return tgtok::comma; | |
106 | case '<': return tgtok::less; | |
107 | case '>': return tgtok::greater; | |
108 | case ']': return tgtok::r_square; | |
109 | case '{': return tgtok::l_brace; | |
110 | case '}': return tgtok::r_brace; | |
111 | case '(': return tgtok::l_paren; | |
112 | case ')': return tgtok::r_paren; | |
113 | case '=': return tgtok::equal; | |
114 | case '?': return tgtok::question; | |
115 | case '#': return tgtok::paste; | |
116 | ||
117 | case 0: | |
118 | case ' ': | |
119 | case '\t': | |
120 | case '\n': | |
121 | case '\r': | |
122 | // Ignore whitespace. | |
123 | return LexToken(); | |
124 | case '/': | |
125 | // If this is the start of a // comment, skip until the end of the line or | |
126 | // the end of the buffer. | |
127 | if (*CurPtr == '/') | |
128 | SkipBCPLComment(); | |
129 | else if (*CurPtr == '*') { | |
130 | if (SkipCComment()) | |
131 | return tgtok::Error; | |
132 | } else // Otherwise, this is an error. | |
133 | return ReturnError(TokStart, "Unexpected character"); | |
134 | return LexToken(); | |
135 | case '-': case '+': | |
136 | case '0': case '1': case '2': case '3': case '4': case '5': case '6': | |
137 | case '7': case '8': case '9': { | |
138 | int NextChar = 0; | |
139 | if (isdigit(CurChar)) { | |
140 | // Allow identifiers to start with a number if it is followed by | |
141 | // an identifier. This can happen with paste operations like | |
142 | // foo#8i. | |
143 | int i = 0; | |
144 | do { | |
145 | NextChar = peekNextChar(i++); | |
146 | } while (isdigit(NextChar)); | |
147 | ||
148 | if (NextChar == 'x' || NextChar == 'b') { | |
149 | // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most | |
150 | // likely a number. | |
151 | int NextNextChar = peekNextChar(i); | |
152 | switch (NextNextChar) { | |
153 | default: | |
154 | break; | |
155 | case '0': case '1': | |
156 | if (NextChar == 'b') | |
157 | return LexNumber(); | |
158 | // Fallthrough | |
159 | case '2': case '3': case '4': case '5': | |
160 | case '6': case '7': case '8': case '9': | |
161 | case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': | |
162 | case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': | |
163 | if (NextChar == 'x') | |
164 | return LexNumber(); | |
165 | break; | |
166 | } | |
167 | } | |
168 | } | |
169 | ||
170 | if (isalpha(NextChar) || NextChar == '_') | |
171 | return LexIdentifier(); | |
172 | ||
173 | return LexNumber(); | |
174 | } | |
175 | case '"': return LexString(); | |
176 | case '$': return LexVarName(); | |
177 | case '[': return LexBracket(); | |
178 | case '!': return LexExclaim(); | |
179 | } | |
180 | } | |
181 | ||
182 | /// LexString - Lex "[^"]*" | |
183 | tgtok::TokKind TGLexer::LexString() { | |
184 | const char *StrStart = CurPtr; | |
185 | ||
186 | CurStrVal = ""; | |
187 | ||
188 | while (*CurPtr != '"') { | |
189 | // If we hit the end of the buffer, report an error. | |
1a4d82fc | 190 | if (*CurPtr == 0 && CurPtr == CurBuf.end()) |
223e47cc LB |
191 | return ReturnError(StrStart, "End of file in string literal"); |
192 | ||
193 | if (*CurPtr == '\n' || *CurPtr == '\r') | |
194 | return ReturnError(StrStart, "End of line in string literal"); | |
195 | ||
196 | if (*CurPtr != '\\') { | |
197 | CurStrVal += *CurPtr++; | |
198 | continue; | |
199 | } | |
200 | ||
201 | ++CurPtr; | |
202 | ||
203 | switch (*CurPtr) { | |
204 | case '\\': case '\'': case '"': | |
205 | // These turn into their literal character. | |
206 | CurStrVal += *CurPtr++; | |
207 | break; | |
208 | case 't': | |
209 | CurStrVal += '\t'; | |
210 | ++CurPtr; | |
211 | break; | |
212 | case 'n': | |
213 | CurStrVal += '\n'; | |
214 | ++CurPtr; | |
215 | break; | |
216 | ||
217 | case '\n': | |
218 | case '\r': | |
219 | return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); | |
220 | ||
221 | // If we hit the end of the buffer, report an error. | |
222 | case '\0': | |
1a4d82fc | 223 | if (CurPtr == CurBuf.end()) |
223e47cc LB |
224 | return ReturnError(StrStart, "End of file in string literal"); |
225 | // FALL THROUGH | |
226 | default: | |
227 | return ReturnError(CurPtr, "invalid escape in string literal"); | |
228 | } | |
229 | } | |
230 | ||
231 | ++CurPtr; | |
232 | return tgtok::StrVal; | |
233 | } | |
234 | ||
235 | tgtok::TokKind TGLexer::LexVarName() { | |
236 | if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') | |
237 | return ReturnError(TokStart, "Invalid variable name"); | |
238 | ||
239 | // Otherwise, we're ok, consume the rest of the characters. | |
240 | const char *VarNameStart = CurPtr++; | |
241 | ||
242 | while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') | |
243 | ++CurPtr; | |
244 | ||
245 | CurStrVal.assign(VarNameStart, CurPtr); | |
246 | return tgtok::VarName; | |
247 | } | |
248 | ||
249 | ||
250 | tgtok::TokKind TGLexer::LexIdentifier() { | |
251 | // The first letter is [a-zA-Z_#]. | |
252 | const char *IdentStart = TokStart; | |
253 | ||
254 | // Match the rest of the identifier regex: [0-9a-zA-Z_#]* | |
255 | while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') | |
256 | ++CurPtr; | |
257 | ||
258 | // Check to see if this identifier is a keyword. | |
259 | StringRef Str(IdentStart, CurPtr-IdentStart); | |
260 | ||
261 | if (Str == "include") { | |
262 | if (LexInclude()) return tgtok::Error; | |
263 | return Lex(); | |
264 | } | |
265 | ||
266 | tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) | |
267 | .Case("int", tgtok::Int) | |
268 | .Case("bit", tgtok::Bit) | |
269 | .Case("bits", tgtok::Bits) | |
270 | .Case("string", tgtok::String) | |
271 | .Case("list", tgtok::List) | |
272 | .Case("code", tgtok::Code) | |
273 | .Case("dag", tgtok::Dag) | |
274 | .Case("class", tgtok::Class) | |
275 | .Case("def", tgtok::Def) | |
276 | .Case("foreach", tgtok::Foreach) | |
277 | .Case("defm", tgtok::Defm) | |
278 | .Case("multiclass", tgtok::MultiClass) | |
279 | .Case("field", tgtok::Field) | |
280 | .Case("let", tgtok::Let) | |
281 | .Case("in", tgtok::In) | |
282 | .Default(tgtok::Id); | |
283 | ||
284 | if (Kind == tgtok::Id) | |
285 | CurStrVal.assign(Str.begin(), Str.end()); | |
286 | return Kind; | |
287 | } | |
288 | ||
289 | /// LexInclude - We just read the "include" token. Get the string token that | |
290 | /// comes next and enter the include. | |
291 | bool TGLexer::LexInclude() { | |
292 | // The token after the include must be a string. | |
293 | tgtok::TokKind Tok = LexToken(); | |
294 | if (Tok == tgtok::Error) return true; | |
295 | if (Tok != tgtok::StrVal) { | |
296 | PrintError(getLoc(), "Expected filename after include"); | |
297 | return true; | |
298 | } | |
299 | ||
300 | // Get the string. | |
301 | std::string Filename = CurStrVal; | |
302 | std::string IncludedFile; | |
303 | ||
304 | ||
305 | CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), | |
306 | IncludedFile); | |
1a4d82fc | 307 | if (!CurBuffer) { |
223e47cc LB |
308 | PrintError(getLoc(), "Could not find include file '" + Filename + "'"); |
309 | return true; | |
310 | } | |
311 | ||
970d7e83 LB |
312 | DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile); |
313 | if (Found != Dependencies.end()) { | |
314 | PrintError(getLoc(), | |
315 | "File '" + IncludedFile + "' has already been included."); | |
316 | SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note, | |
317 | "previously included here"); | |
318 | return true; | |
319 | } | |
320 | Dependencies.insert(std::make_pair(IncludedFile, getLoc())); | |
223e47cc | 321 | // Save the line number and lex buffer of the includer. |
1a4d82fc JJ |
322 | CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); |
323 | CurPtr = CurBuf.begin(); | |
223e47cc LB |
324 | return false; |
325 | } | |
326 | ||
327 | void TGLexer::SkipBCPLComment() { | |
328 | ++CurPtr; // skip the second slash. | |
329 | while (1) { | |
330 | switch (*CurPtr) { | |
331 | case '\n': | |
332 | case '\r': | |
333 | return; // Newline is end of comment. | |
334 | case 0: | |
335 | // If this is the end of the buffer, end the comment. | |
1a4d82fc | 336 | if (CurPtr == CurBuf.end()) |
223e47cc LB |
337 | return; |
338 | break; | |
339 | } | |
340 | // Otherwise, skip the character. | |
341 | ++CurPtr; | |
342 | } | |
343 | } | |
344 | ||
345 | /// SkipCComment - This skips C-style /**/ comments. The only difference from C | |
346 | /// is that we allow nesting. | |
347 | bool TGLexer::SkipCComment() { | |
348 | ++CurPtr; // skip the star. | |
349 | unsigned CommentDepth = 1; | |
350 | ||
351 | while (1) { | |
352 | int CurChar = getNextChar(); | |
353 | switch (CurChar) { | |
354 | case EOF: | |
355 | PrintError(TokStart, "Unterminated comment!"); | |
356 | return true; | |
357 | case '*': | |
358 | // End of the comment? | |
359 | if (CurPtr[0] != '/') break; | |
360 | ||
361 | ++CurPtr; // End the */. | |
362 | if (--CommentDepth == 0) | |
363 | return false; | |
364 | break; | |
365 | case '/': | |
366 | // Start of a nested comment? | |
367 | if (CurPtr[0] != '*') break; | |
368 | ++CurPtr; | |
369 | ++CommentDepth; | |
370 | break; | |
371 | } | |
372 | } | |
373 | } | |
374 | ||
375 | /// LexNumber - Lex: | |
376 | /// [-+]?[0-9]+ | |
377 | /// 0x[0-9a-fA-F]+ | |
378 | /// 0b[01]+ | |
379 | tgtok::TokKind TGLexer::LexNumber() { | |
380 | if (CurPtr[-1] == '0') { | |
381 | if (CurPtr[0] == 'x') { | |
382 | ++CurPtr; | |
383 | const char *NumStart = CurPtr; | |
384 | while (isxdigit(CurPtr[0])) | |
385 | ++CurPtr; | |
386 | ||
387 | // Requires at least one hex digit. | |
388 | if (CurPtr == NumStart) | |
389 | return ReturnError(TokStart, "Invalid hexadecimal number"); | |
390 | ||
391 | errno = 0; | |
1a4d82fc | 392 | CurIntVal = strtoll(NumStart, nullptr, 16); |
223e47cc LB |
393 | if (errno == EINVAL) |
394 | return ReturnError(TokStart, "Invalid hexadecimal number"); | |
395 | if (errno == ERANGE) { | |
396 | errno = 0; | |
1a4d82fc | 397 | CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16); |
223e47cc LB |
398 | if (errno == EINVAL) |
399 | return ReturnError(TokStart, "Invalid hexadecimal number"); | |
400 | if (errno == ERANGE) | |
401 | return ReturnError(TokStart, "Hexadecimal number out of range"); | |
402 | } | |
403 | return tgtok::IntVal; | |
404 | } else if (CurPtr[0] == 'b') { | |
405 | ++CurPtr; | |
406 | const char *NumStart = CurPtr; | |
407 | while (CurPtr[0] == '0' || CurPtr[0] == '1') | |
408 | ++CurPtr; | |
409 | ||
410 | // Requires at least one binary digit. | |
411 | if (CurPtr == NumStart) | |
412 | return ReturnError(CurPtr-2, "Invalid binary number"); | |
1a4d82fc JJ |
413 | CurIntVal = strtoll(NumStart, nullptr, 2); |
414 | return tgtok::BinaryIntVal; | |
223e47cc LB |
415 | } |
416 | } | |
417 | ||
418 | // Check for a sign without a digit. | |
419 | if (!isdigit(CurPtr[0])) { | |
420 | if (CurPtr[-1] == '-') | |
421 | return tgtok::minus; | |
422 | else if (CurPtr[-1] == '+') | |
423 | return tgtok::plus; | |
424 | } | |
425 | ||
426 | while (isdigit(CurPtr[0])) | |
427 | ++CurPtr; | |
1a4d82fc | 428 | CurIntVal = strtoll(TokStart, nullptr, 10); |
223e47cc LB |
429 | return tgtok::IntVal; |
430 | } | |
431 | ||
432 | /// LexBracket - We just read '['. If this is a code block, return it, | |
433 | /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' | |
434 | tgtok::TokKind TGLexer::LexBracket() { | |
435 | if (CurPtr[0] != '{') | |
436 | return tgtok::l_square; | |
437 | ++CurPtr; | |
438 | const char *CodeStart = CurPtr; | |
439 | while (1) { | |
440 | int Char = getNextChar(); | |
441 | if (Char == EOF) break; | |
442 | ||
443 | if (Char != '}') continue; | |
444 | ||
445 | Char = getNextChar(); | |
446 | if (Char == EOF) break; | |
447 | if (Char == ']') { | |
448 | CurStrVal.assign(CodeStart, CurPtr-2); | |
449 | return tgtok::CodeFragment; | |
450 | } | |
451 | } | |
452 | ||
453 | return ReturnError(CodeStart-2, "Unterminated Code Block"); | |
454 | } | |
455 | ||
456 | /// LexExclaim - Lex '!' and '![a-zA-Z]+'. | |
457 | tgtok::TokKind TGLexer::LexExclaim() { | |
458 | if (!isalpha(*CurPtr)) | |
459 | return ReturnError(CurPtr - 1, "Invalid \"!operator\""); | |
460 | ||
461 | const char *Start = CurPtr++; | |
462 | while (isalpha(*CurPtr)) | |
463 | ++CurPtr; | |
464 | ||
465 | // Check to see which operator this is. | |
466 | tgtok::TokKind Kind = | |
467 | StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) | |
468 | .Case("eq", tgtok::XEq) | |
469 | .Case("if", tgtok::XIf) | |
470 | .Case("head", tgtok::XHead) | |
471 | .Case("tail", tgtok::XTail) | |
472 | .Case("con", tgtok::XConcat) | |
970d7e83 | 473 | .Case("add", tgtok::XADD) |
1a4d82fc | 474 | .Case("and", tgtok::XAND) |
223e47cc LB |
475 | .Case("shl", tgtok::XSHL) |
476 | .Case("sra", tgtok::XSRA) | |
477 | .Case("srl", tgtok::XSRL) | |
478 | .Case("cast", tgtok::XCast) | |
479 | .Case("empty", tgtok::XEmpty) | |
480 | .Case("subst", tgtok::XSubst) | |
481 | .Case("foreach", tgtok::XForEach) | |
1a4d82fc | 482 | .Case("listconcat", tgtok::XListConcat) |
223e47cc LB |
483 | .Case("strconcat", tgtok::XStrConcat) |
484 | .Default(tgtok::Error); | |
485 | ||
486 | return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); | |
487 | } | |
488 |