]>
Commit | Line | Data |
---|---|---|
223e47cc LB |
1 | //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// |
2 | // | |
3 | // The LLVM Compiler Infrastructure | |
4 | // | |
5 | // This file is distributed under the University of Illinois Open Source | |
6 | // License. See LICENSE.TXT for details. | |
7 | // | |
8 | //===----------------------------------------------------------------------===// | |
9 | // | |
10 | // This file implements a YAML parser. | |
11 | // | |
12 | //===----------------------------------------------------------------------===// | |
13 | ||
14 | #include "llvm/Support/YAMLParser.h" | |
223e47cc LB |
15 | #include "llvm/ADT/SmallVector.h" |
16 | #include "llvm/ADT/StringExtras.h" | |
17 | #include "llvm/ADT/Twine.h" | |
970d7e83 LB |
18 | #include "llvm/ADT/ilist.h" |
19 | #include "llvm/ADT/ilist_node.h" | |
223e47cc LB |
20 | #include "llvm/Support/ErrorHandling.h" |
21 | #include "llvm/Support/MemoryBuffer.h" | |
223e47cc | 22 | #include "llvm/Support/SourceMgr.h" |
970d7e83 | 23 | #include "llvm/Support/raw_ostream.h" |
223e47cc LB |
24 | |
25 | using namespace llvm; | |
26 | using namespace yaml; | |
27 | ||
/// The Unicode encoding forms that getUnicodeEncoding can report.
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32, little endian.
  UEF_UTF32_BE, ///< UTF-32, big endian.
  UEF_UTF16_LE, ///< UTF-16, little endian.
  UEF_UTF16_BE, ///< UTF-16, big endian.
  UEF_UTF8,     ///< UTF-8 (which also covers plain ASCII).
  UEF_Unknown   ///< Not a valid Unicode encoding.
};

/// EncodingInfo - An encoding form paired with the length, in bytes, of the
/// byte order mark when one is present. The length is one of {0, 2, 3, 4}.
using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;
40 | ||
41 | /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode | |
42 | /// encoding form of \a Input. | |
43 | /// | |
44 | /// @param Input A string of length 0 or more. | |
45 | /// @returns An EncodingInfo indicating the Unicode encoding form of the input | |
46 | /// and how long the byte order mark is if one exists. | |
47 | static EncodingInfo getUnicodeEncoding(StringRef Input) { | |
48 | if (Input.size() == 0) | |
49 | return std::make_pair(UEF_Unknown, 0); | |
50 | ||
51 | switch (uint8_t(Input[0])) { | |
52 | case 0x00: | |
53 | if (Input.size() >= 4) { | |
54 | if ( Input[1] == 0 | |
55 | && uint8_t(Input[2]) == 0xFE | |
56 | && uint8_t(Input[3]) == 0xFF) | |
57 | return std::make_pair(UEF_UTF32_BE, 4); | |
58 | if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) | |
59 | return std::make_pair(UEF_UTF32_BE, 0); | |
60 | } | |
61 | ||
62 | if (Input.size() >= 2 && Input[1] != 0) | |
63 | return std::make_pair(UEF_UTF16_BE, 0); | |
64 | return std::make_pair(UEF_Unknown, 0); | |
65 | case 0xFF: | |
66 | if ( Input.size() >= 4 | |
67 | && uint8_t(Input[1]) == 0xFE | |
68 | && Input[2] == 0 | |
69 | && Input[3] == 0) | |
70 | return std::make_pair(UEF_UTF32_LE, 4); | |
71 | ||
72 | if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) | |
73 | return std::make_pair(UEF_UTF16_LE, 2); | |
74 | return std::make_pair(UEF_Unknown, 0); | |
75 | case 0xFE: | |
76 | if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) | |
77 | return std::make_pair(UEF_UTF16_BE, 2); | |
78 | return std::make_pair(UEF_Unknown, 0); | |
79 | case 0xEF: | |
80 | if ( Input.size() >= 3 | |
81 | && uint8_t(Input[1]) == 0xBB | |
82 | && uint8_t(Input[2]) == 0xBF) | |
83 | return std::make_pair(UEF_UTF8, 3); | |
84 | return std::make_pair(UEF_Unknown, 0); | |
85 | } | |
86 | ||
87 | // It could still be utf-32 or utf-16. | |
88 | if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) | |
89 | return std::make_pair(UEF_UTF32_LE, 0); | |
90 | ||
91 | if (Input.size() >= 2 && Input[1] == 0) | |
92 | return std::make_pair(UEF_UTF16_LE, 0); | |
93 | ||
94 | return std::make_pair(UEF_UTF8, 0); | |
95 | } | |
96 | ||
97 | namespace llvm { | |
98 | namespace yaml { | |
1a4d82fc JJ |
99 | /// Pin the vtables to this file. |
100 | void Node::anchor() {} | |
101 | void NullNode::anchor() {} | |
102 | void ScalarNode::anchor() {} | |
103 | void KeyValueNode::anchor() {} | |
104 | void MappingNode::anchor() {} | |
105 | void SequenceNode::anchor() {} | |
106 | void AliasNode::anchor() {} | |
107 | ||
223e47cc LB |
108 | /// Token - A single YAML token. |
109 | struct Token : ilist_node<Token> { | |
110 | enum TokenKind { | |
111 | TK_Error, // Uninitialized token. | |
112 | TK_StreamStart, | |
113 | TK_StreamEnd, | |
114 | TK_VersionDirective, | |
115 | TK_TagDirective, | |
116 | TK_DocumentStart, | |
117 | TK_DocumentEnd, | |
118 | TK_BlockEntry, | |
119 | TK_BlockEnd, | |
120 | TK_BlockSequenceStart, | |
121 | TK_BlockMappingStart, | |
122 | TK_FlowEntry, | |
123 | TK_FlowSequenceStart, | |
124 | TK_FlowSequenceEnd, | |
125 | TK_FlowMappingStart, | |
126 | TK_FlowMappingEnd, | |
127 | TK_Key, | |
128 | TK_Value, | |
129 | TK_Scalar, | |
130 | TK_Alias, | |
131 | TK_Anchor, | |
132 | TK_Tag | |
133 | } Kind; | |
134 | ||
135 | /// A string of length 0 or more whose begin() points to the logical location | |
136 | /// of the token in the input. | |
137 | StringRef Range; | |
138 | ||
139 | Token() : Kind(TK_Error) {} | |
140 | }; | |
141 | } | |
142 | } | |
143 | ||
144 | namespace llvm { | |
145 | template<> | |
146 | struct ilist_sentinel_traits<Token> { | |
147 | Token *createSentinel() const { | |
148 | return &Sentinel; | |
149 | } | |
150 | static void destroySentinel(Token*) {} | |
151 | ||
152 | Token *provideInitialHead() const { return createSentinel(); } | |
153 | Token *ensureHead(Token*) const { return createSentinel(); } | |
154 | static void noteHead(Token*, Token*) {} | |
155 | ||
156 | private: | |
157 | mutable Token Sentinel; | |
158 | }; | |
159 | ||
160 | template<> | |
161 | struct ilist_node_traits<Token> { | |
162 | Token *createNode(const Token &V) { | |
163 | return new (Alloc.Allocate<Token>()) Token(V); | |
164 | } | |
165 | static void deleteNode(Token *V) {} | |
166 | ||
167 | void addNodeToList(Token *) {} | |
168 | void removeNodeFromList(Token *) {} | |
169 | void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, | |
170 | ilist_iterator<Token> /*first*/, | |
171 | ilist_iterator<Token> /*last*/) {} | |
172 | ||
173 | BumpPtrAllocator Alloc; | |
174 | }; | |
175 | } | |
176 | ||
177 | typedef ilist<Token> TokenQueueT; | |
178 | ||
179 | namespace { | |
180 | /// @brief This struct is used to track simple keys. | |
181 | /// | |
182 | /// Simple keys are handled by creating an entry in SimpleKeys for each Token | |
183 | /// which could legally be the start of a simple key. When peekNext is called, | |
184 | /// if the Token To be returned is referenced by a SimpleKey, we continue | |
185 | /// tokenizing until that potential simple key has either been found to not be | |
186 | /// a simple key (we moved on to the next line or went further than 1024 chars). | |
187 | /// Or when we run into a Value, and then insert a Key token (and possibly | |
188 | /// others) before the SimpleKey's Tok. | |
189 | struct SimpleKey { | |
190 | TokenQueueT::iterator Tok; | |
191 | unsigned Column; | |
192 | unsigned Line; | |
193 | unsigned FlowLevel; | |
194 | bool IsRequired; | |
195 | ||
196 | bool operator ==(const SimpleKey &Other) { | |
197 | return Tok == Other.Tok; | |
198 | } | |
199 | }; | |
200 | } | |
201 | ||
202 | /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit | |
203 | /// subsequence and the subsequence's length in code units (uint8_t). | |
204 | /// A length of 0 represents an error. | |
205 | typedef std::pair<uint32_t, unsigned> UTF8Decoded; | |
206 | ||
207 | static UTF8Decoded decodeUTF8(StringRef Range) { | |
208 | StringRef::iterator Position= Range.begin(); | |
209 | StringRef::iterator End = Range.end(); | |
210 | // 1 byte: [0x00, 0x7f] | |
211 | // Bit pattern: 0xxxxxxx | |
212 | if ((*Position & 0x80) == 0) { | |
213 | return std::make_pair(*Position, 1); | |
214 | } | |
215 | // 2 bytes: [0x80, 0x7ff] | |
216 | // Bit pattern: 110xxxxx 10xxxxxx | |
217 | if (Position + 1 != End && | |
218 | ((*Position & 0xE0) == 0xC0) && | |
219 | ((*(Position + 1) & 0xC0) == 0x80)) { | |
220 | uint32_t codepoint = ((*Position & 0x1F) << 6) | | |
221 | (*(Position + 1) & 0x3F); | |
222 | if (codepoint >= 0x80) | |
223 | return std::make_pair(codepoint, 2); | |
224 | } | |
225 | // 3 bytes: [0x8000, 0xffff] | |
226 | // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx | |
227 | if (Position + 2 != End && | |
228 | ((*Position & 0xF0) == 0xE0) && | |
229 | ((*(Position + 1) & 0xC0) == 0x80) && | |
230 | ((*(Position + 2) & 0xC0) == 0x80)) { | |
231 | uint32_t codepoint = ((*Position & 0x0F) << 12) | | |
232 | ((*(Position + 1) & 0x3F) << 6) | | |
233 | (*(Position + 2) & 0x3F); | |
234 | // Codepoints between 0xD800 and 0xDFFF are invalid, as | |
235 | // they are high / low surrogate halves used by UTF-16. | |
236 | if (codepoint >= 0x800 && | |
237 | (codepoint < 0xD800 || codepoint > 0xDFFF)) | |
238 | return std::make_pair(codepoint, 3); | |
239 | } | |
240 | // 4 bytes: [0x10000, 0x10FFFF] | |
241 | // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | |
242 | if (Position + 3 != End && | |
243 | ((*Position & 0xF8) == 0xF0) && | |
244 | ((*(Position + 1) & 0xC0) == 0x80) && | |
245 | ((*(Position + 2) & 0xC0) == 0x80) && | |
246 | ((*(Position + 3) & 0xC0) == 0x80)) { | |
247 | uint32_t codepoint = ((*Position & 0x07) << 18) | | |
248 | ((*(Position + 1) & 0x3F) << 12) | | |
249 | ((*(Position + 2) & 0x3F) << 6) | | |
250 | (*(Position + 3) & 0x3F); | |
251 | if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) | |
252 | return std::make_pair(codepoint, 4); | |
253 | } | |
254 | return std::make_pair(0, 0); | |
255 | } | |
256 | ||
257 | namespace llvm { | |
258 | namespace yaml { | |
259 | /// @brief Scans YAML tokens from a MemoryBuffer. | |
260 | class Scanner { | |
261 | public: | |
1a4d82fc JJ |
262 | Scanner(StringRef Input, SourceMgr &SM); |
263 | Scanner(MemoryBufferRef Buffer, SourceMgr &SM_); | |
223e47cc LB |
264 | |
265 | /// @brief Parse the next token and return it without popping it. | |
266 | Token &peekNext(); | |
267 | ||
268 | /// @brief Parse the next token and pop it from the queue. | |
269 | Token getNext(); | |
270 | ||
271 | void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, | |
1a4d82fc | 272 | ArrayRef<SMRange> Ranges = None) { |
223e47cc LB |
273 | SM.PrintMessage(Loc, Kind, Message, Ranges); |
274 | } | |
275 | ||
276 | void setError(const Twine &Message, StringRef::iterator Position) { | |
277 | if (Current >= End) | |
278 | Current = End - 1; | |
279 | ||
280 | // Don't print out more errors after the first one we encounter. The rest | |
281 | // are just the result of the first, and have no meaning. | |
282 | if (!Failed) | |
283 | printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); | |
284 | Failed = true; | |
285 | } | |
286 | ||
287 | void setError(const Twine &Message) { | |
288 | setError(Message, Current); | |
289 | } | |
290 | ||
291 | /// @brief Returns true if an error occurred while parsing. | |
292 | bool failed() { | |
293 | return Failed; | |
294 | } | |
295 | ||
296 | private: | |
1a4d82fc JJ |
297 | void init(MemoryBufferRef Buffer); |
298 | ||
223e47cc LB |
299 | StringRef currentInput() { |
300 | return StringRef(Current, End - Current); | |
301 | } | |
302 | ||
303 | /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting | |
304 | /// at \a Position. | |
305 | /// | |
306 | /// If the UTF-8 code units starting at Position do not form a well-formed | |
307 | /// code unit subsequence, then the Unicode scalar value is 0, and the length | |
308 | /// is 0. | |
309 | UTF8Decoded decodeUTF8(StringRef::iterator Position) { | |
310 | return ::decodeUTF8(StringRef(Position, End - Position)); | |
311 | } | |
312 | ||
313 | // The following functions are based on the gramar rules in the YAML spec. The | |
314 | // style of the function names it meant to closely match how they are written | |
315 | // in the spec. The number within the [] is the number of the grammar rule in | |
316 | // the spec. | |
317 | // | |
318 | // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. | |
319 | // | |
320 | // c- | |
321 | // A production starting and ending with a special character. | |
322 | // b- | |
323 | // A production matching a single line break. | |
324 | // nb- | |
325 | // A production starting and ending with a non-break character. | |
326 | // s- | |
327 | // A production starting and ending with a white space character. | |
328 | // ns- | |
329 | // A production starting and ending with a non-space character. | |
330 | // l- | |
331 | // A production matching complete line(s). | |
332 | ||
333 | /// @brief Skip a single nb-char[27] starting at Position. | |
334 | /// | |
335 | /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] | |
336 | /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] | |
337 | /// | |
338 | /// @returns The code unit after the nb-char, or Position if it's not an | |
339 | /// nb-char. | |
340 | StringRef::iterator skip_nb_char(StringRef::iterator Position); | |
341 | ||
342 | /// @brief Skip a single b-break[28] starting at Position. | |
343 | /// | |
344 | /// A b-break is 0xD 0xA | 0xD | 0xA | |
345 | /// | |
346 | /// @returns The code unit after the b-break, or Position if it's not a | |
347 | /// b-break. | |
348 | StringRef::iterator skip_b_break(StringRef::iterator Position); | |
349 | ||
350 | /// @brief Skip a single s-white[33] starting at Position. | |
351 | /// | |
352 | /// A s-white is 0x20 | 0x9 | |
353 | /// | |
354 | /// @returns The code unit after the s-white, or Position if it's not a | |
355 | /// s-white. | |
356 | StringRef::iterator skip_s_white(StringRef::iterator Position); | |
357 | ||
358 | /// @brief Skip a single ns-char[34] starting at Position. | |
359 | /// | |
360 | /// A ns-char is nb-char - s-white | |
361 | /// | |
362 | /// @returns The code unit after the ns-char, or Position if it's not a | |
363 | /// ns-char. | |
364 | StringRef::iterator skip_ns_char(StringRef::iterator Position); | |
365 | ||
366 | typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); | |
367 | /// @brief Skip minimal well-formed code unit subsequences until Func | |
368 | /// returns its input. | |
369 | /// | |
370 | /// @returns The code unit after the last minimal well-formed code unit | |
371 | /// subsequence that Func accepted. | |
372 | StringRef::iterator skip_while( SkipWhileFunc Func | |
373 | , StringRef::iterator Position); | |
374 | ||
375 | /// @brief Scan ns-uri-char[39]s starting at Cur. | |
376 | /// | |
377 | /// This updates Cur and Column while scanning. | |
378 | /// | |
379 | /// @returns A StringRef starting at Cur which covers the longest contiguous | |
380 | /// sequence of ns-uri-char. | |
381 | StringRef scan_ns_uri_char(); | |
382 | ||
223e47cc LB |
383 | /// @brief Consume a minimal well-formed code unit subsequence starting at |
384 | /// \a Cur. Return false if it is not the same Unicode scalar value as | |
385 | /// \a Expected. This updates \a Column. | |
386 | bool consume(uint32_t Expected); | |
387 | ||
388 | /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. | |
389 | void skip(uint32_t Distance); | |
390 | ||
391 | /// @brief Return true if the minimal well-formed code unit subsequence at | |
392 | /// Pos is whitespace or a new line | |
393 | bool isBlankOrBreak(StringRef::iterator Position); | |
394 | ||
395 | /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. | |
396 | void saveSimpleKeyCandidate( TokenQueueT::iterator Tok | |
397 | , unsigned AtColumn | |
398 | , bool IsRequired); | |
399 | ||
400 | /// @brief Remove simple keys that can no longer be valid simple keys. | |
401 | /// | |
402 | /// Invalid simple keys are not on the current line or are further than 1024 | |
403 | /// columns back. | |
404 | void removeStaleSimpleKeyCandidates(); | |
405 | ||
406 | /// @brief Remove all simple keys on FlowLevel \a Level. | |
407 | void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); | |
408 | ||
409 | /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd | |
410 | /// tokens if needed. | |
411 | bool unrollIndent(int ToColumn); | |
412 | ||
413 | /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint | |
414 | /// if needed. | |
415 | bool rollIndent( int ToColumn | |
416 | , Token::TokenKind Kind | |
417 | , TokenQueueT::iterator InsertPoint); | |
418 | ||
419 | /// @brief Skip whitespace and comments until the start of the next token. | |
420 | void scanToNextToken(); | |
421 | ||
422 | /// @brief Must be the first token generated. | |
423 | bool scanStreamStart(); | |
424 | ||
425 | /// @brief Generate tokens needed to close out the stream. | |
426 | bool scanStreamEnd(); | |
427 | ||
428 | /// @brief Scan a %BLAH directive. | |
429 | bool scanDirective(); | |
430 | ||
431 | /// @brief Scan a ... or ---. | |
432 | bool scanDocumentIndicator(bool IsStart); | |
433 | ||
434 | /// @brief Scan a [ or { and generate the proper flow collection start token. | |
435 | bool scanFlowCollectionStart(bool IsSequence); | |
436 | ||
437 | /// @brief Scan a ] or } and generate the proper flow collection end token. | |
438 | bool scanFlowCollectionEnd(bool IsSequence); | |
439 | ||
440 | /// @brief Scan the , that separates entries in a flow collection. | |
441 | bool scanFlowEntry(); | |
442 | ||
443 | /// @brief Scan the - that starts block sequence entries. | |
444 | bool scanBlockEntry(); | |
445 | ||
446 | /// @brief Scan an explicit ? indicating a key. | |
447 | bool scanKey(); | |
448 | ||
449 | /// @brief Scan an explicit : indicating a value. | |
450 | bool scanValue(); | |
451 | ||
452 | /// @brief Scan a quoted scalar. | |
453 | bool scanFlowScalar(bool IsDoubleQuoted); | |
454 | ||
455 | /// @brief Scan an unquoted scalar. | |
456 | bool scanPlainScalar(); | |
457 | ||
458 | /// @brief Scan an Alias or Anchor starting with * or &. | |
459 | bool scanAliasOrAnchor(bool IsAlias); | |
460 | ||
461 | /// @brief Scan a block scalar starting with | or >. | |
462 | bool scanBlockScalar(bool IsLiteral); | |
463 | ||
464 | /// @brief Scan a tag of the form !stuff. | |
465 | bool scanTag(); | |
466 | ||
467 | /// @brief Dispatch to the next scanning function based on \a *Cur. | |
468 | bool fetchMoreTokens(); | |
469 | ||
470 | /// @brief The SourceMgr used for diagnostics and buffer management. | |
471 | SourceMgr &SM; | |
472 | ||
473 | /// @brief The original input. | |
1a4d82fc | 474 | MemoryBufferRef InputBuffer; |
223e47cc LB |
475 | |
476 | /// @brief The current position of the scanner. | |
477 | StringRef::iterator Current; | |
478 | ||
479 | /// @brief The end of the input (one past the last character). | |
480 | StringRef::iterator End; | |
481 | ||
482 | /// @brief Current YAML indentation level in spaces. | |
483 | int Indent; | |
484 | ||
485 | /// @brief Current column number in Unicode code points. | |
486 | unsigned Column; | |
487 | ||
488 | /// @brief Current line number. | |
489 | unsigned Line; | |
490 | ||
491 | /// @brief How deep we are in flow style containers. 0 Means at block level. | |
492 | unsigned FlowLevel; | |
493 | ||
494 | /// @brief Are we at the start of the stream? | |
495 | bool IsStartOfStream; | |
496 | ||
497 | /// @brief Can the next token be the start of a simple key? | |
498 | bool IsSimpleKeyAllowed; | |
499 | ||
500 | /// @brief True if an error has occurred. | |
501 | bool Failed; | |
502 | ||
503 | /// @brief Queue of tokens. This is required to queue up tokens while looking | |
504 | /// for the end of a simple key. And for cases where a single character | |
505 | /// can produce multiple tokens (e.g. BlockEnd). | |
506 | TokenQueueT TokenQueue; | |
507 | ||
508 | /// @brief Indentation levels. | |
509 | SmallVector<int, 4> Indents; | |
510 | ||
511 | /// @brief Potential simple keys. | |
512 | SmallVector<SimpleKey, 4> SimpleKeys; | |
513 | }; | |
514 | ||
515 | } // end namespace yaml | |
516 | } // end namespace llvm | |
517 | ||
518 | /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. | |
519 | static void encodeUTF8( uint32_t UnicodeScalarValue | |
520 | , SmallVectorImpl<char> &Result) { | |
521 | if (UnicodeScalarValue <= 0x7F) { | |
522 | Result.push_back(UnicodeScalarValue & 0x7F); | |
523 | } else if (UnicodeScalarValue <= 0x7FF) { | |
524 | uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); | |
525 | uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); | |
526 | Result.push_back(FirstByte); | |
527 | Result.push_back(SecondByte); | |
528 | } else if (UnicodeScalarValue <= 0xFFFF) { | |
529 | uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); | |
530 | uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); | |
531 | uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); | |
532 | Result.push_back(FirstByte); | |
533 | Result.push_back(SecondByte); | |
534 | Result.push_back(ThirdByte); | |
535 | } else if (UnicodeScalarValue <= 0x10FFFF) { | |
536 | uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); | |
537 | uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); | |
538 | uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); | |
539 | uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); | |
540 | Result.push_back(FirstByte); | |
541 | Result.push_back(SecondByte); | |
542 | Result.push_back(ThirdByte); | |
543 | Result.push_back(FourthByte); | |
544 | } | |
545 | } | |
546 | ||
547 | bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { | |
548 | SourceMgr SM; | |
549 | Scanner scanner(Input, SM); | |
550 | while (true) { | |
551 | Token T = scanner.getNext(); | |
552 | switch (T.Kind) { | |
553 | case Token::TK_StreamStart: | |
554 | OS << "Stream-Start: "; | |
555 | break; | |
556 | case Token::TK_StreamEnd: | |
557 | OS << "Stream-End: "; | |
558 | break; | |
559 | case Token::TK_VersionDirective: | |
560 | OS << "Version-Directive: "; | |
561 | break; | |
562 | case Token::TK_TagDirective: | |
563 | OS << "Tag-Directive: "; | |
564 | break; | |
565 | case Token::TK_DocumentStart: | |
566 | OS << "Document-Start: "; | |
567 | break; | |
568 | case Token::TK_DocumentEnd: | |
569 | OS << "Document-End: "; | |
570 | break; | |
571 | case Token::TK_BlockEntry: | |
572 | OS << "Block-Entry: "; | |
573 | break; | |
574 | case Token::TK_BlockEnd: | |
575 | OS << "Block-End: "; | |
576 | break; | |
577 | case Token::TK_BlockSequenceStart: | |
578 | OS << "Block-Sequence-Start: "; | |
579 | break; | |
580 | case Token::TK_BlockMappingStart: | |
581 | OS << "Block-Mapping-Start: "; | |
582 | break; | |
583 | case Token::TK_FlowEntry: | |
584 | OS << "Flow-Entry: "; | |
585 | break; | |
586 | case Token::TK_FlowSequenceStart: | |
587 | OS << "Flow-Sequence-Start: "; | |
588 | break; | |
589 | case Token::TK_FlowSequenceEnd: | |
590 | OS << "Flow-Sequence-End: "; | |
591 | break; | |
592 | case Token::TK_FlowMappingStart: | |
593 | OS << "Flow-Mapping-Start: "; | |
594 | break; | |
595 | case Token::TK_FlowMappingEnd: | |
596 | OS << "Flow-Mapping-End: "; | |
597 | break; | |
598 | case Token::TK_Key: | |
599 | OS << "Key: "; | |
600 | break; | |
601 | case Token::TK_Value: | |
602 | OS << "Value: "; | |
603 | break; | |
604 | case Token::TK_Scalar: | |
605 | OS << "Scalar: "; | |
606 | break; | |
607 | case Token::TK_Alias: | |
608 | OS << "Alias: "; | |
609 | break; | |
610 | case Token::TK_Anchor: | |
611 | OS << "Anchor: "; | |
612 | break; | |
613 | case Token::TK_Tag: | |
614 | OS << "Tag: "; | |
615 | break; | |
616 | case Token::TK_Error: | |
617 | break; | |
618 | } | |
619 | OS << T.Range << "\n"; | |
620 | if (T.Kind == Token::TK_StreamEnd) | |
621 | break; | |
622 | else if (T.Kind == Token::TK_Error) | |
623 | return false; | |
624 | } | |
625 | return true; | |
626 | } | |
627 | ||
628 | bool yaml::scanTokens(StringRef Input) { | |
629 | llvm::SourceMgr SM; | |
630 | llvm::yaml::Scanner scanner(Input, SM); | |
631 | for (;;) { | |
632 | llvm::yaml::Token T = scanner.getNext(); | |
633 | if (T.Kind == Token::TK_StreamEnd) | |
634 | break; | |
635 | else if (T.Kind == Token::TK_Error) | |
636 | return false; | |
637 | } | |
638 | return true; | |
639 | } | |
640 | ||
641 | std::string yaml::escape(StringRef Input) { | |
642 | std::string EscapedInput; | |
643 | for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { | |
644 | if (*i == '\\') | |
645 | EscapedInput += "\\\\"; | |
646 | else if (*i == '"') | |
647 | EscapedInput += "\\\""; | |
648 | else if (*i == 0) | |
649 | EscapedInput += "\\0"; | |
650 | else if (*i == 0x07) | |
651 | EscapedInput += "\\a"; | |
652 | else if (*i == 0x08) | |
653 | EscapedInput += "\\b"; | |
654 | else if (*i == 0x09) | |
655 | EscapedInput += "\\t"; | |
656 | else if (*i == 0x0A) | |
657 | EscapedInput += "\\n"; | |
658 | else if (*i == 0x0B) | |
659 | EscapedInput += "\\v"; | |
660 | else if (*i == 0x0C) | |
661 | EscapedInput += "\\f"; | |
662 | else if (*i == 0x0D) | |
663 | EscapedInput += "\\r"; | |
664 | else if (*i == 0x1B) | |
665 | EscapedInput += "\\e"; | |
666 | else if ((unsigned char)*i < 0x20) { // Control characters not handled above. | |
667 | std::string HexStr = utohexstr(*i); | |
668 | EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; | |
669 | } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. | |
670 | UTF8Decoded UnicodeScalarValue | |
671 | = decodeUTF8(StringRef(i, Input.end() - i)); | |
672 | if (UnicodeScalarValue.second == 0) { | |
673 | // Found invalid char. | |
674 | SmallString<4> Val; | |
675 | encodeUTF8(0xFFFD, Val); | |
676 | EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); | |
677 | // FIXME: Error reporting. | |
678 | return EscapedInput; | |
679 | } | |
680 | if (UnicodeScalarValue.first == 0x85) | |
681 | EscapedInput += "\\N"; | |
682 | else if (UnicodeScalarValue.first == 0xA0) | |
683 | EscapedInput += "\\_"; | |
684 | else if (UnicodeScalarValue.first == 0x2028) | |
685 | EscapedInput += "\\L"; | |
686 | else if (UnicodeScalarValue.first == 0x2029) | |
687 | EscapedInput += "\\P"; | |
688 | else { | |
689 | std::string HexStr = utohexstr(UnicodeScalarValue.first); | |
690 | if (HexStr.size() <= 2) | |
691 | EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; | |
692 | else if (HexStr.size() <= 4) | |
693 | EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; | |
694 | else if (HexStr.size() <= 8) | |
695 | EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; | |
696 | } | |
697 | i += UnicodeScalarValue.second - 1; | |
698 | } else | |
699 | EscapedInput.push_back(*i); | |
700 | } | |
701 | return EscapedInput; | |
702 | } | |
703 | ||
1a4d82fc JJ |
704 | Scanner::Scanner(StringRef Input, SourceMgr &sm) : SM(sm) { |
705 | init(MemoryBufferRef(Input, "YAML")); | |
706 | } | |
707 | ||
708 | Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_) : SM(SM_) { | |
709 | init(Buffer); | |
710 | } | |
711 | ||
712 | void Scanner::init(MemoryBufferRef Buffer) { | |
713 | InputBuffer = Buffer; | |
714 | Current = InputBuffer.getBufferStart(); | |
715 | End = InputBuffer.getBufferEnd(); | |
716 | Indent = -1; | |
717 | Column = 0; | |
718 | Line = 0; | |
719 | FlowLevel = 0; | |
720 | IsStartOfStream = true; | |
721 | IsSimpleKeyAllowed = true; | |
722 | Failed = false; | |
723 | std::unique_ptr<MemoryBuffer> InputBufferOwner = | |
724 | MemoryBuffer::getMemBuffer(Buffer); | |
725 | SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); | |
970d7e83 LB |
726 | } |
727 | ||
223e47cc LB |
728 | Token &Scanner::peekNext() { |
729 | // If the current token is a possible simple key, keep parsing until we | |
730 | // can confirm. | |
731 | bool NeedMore = false; | |
732 | while (true) { | |
733 | if (TokenQueue.empty() || NeedMore) { | |
734 | if (!fetchMoreTokens()) { | |
735 | TokenQueue.clear(); | |
736 | TokenQueue.push_back(Token()); | |
737 | return TokenQueue.front(); | |
738 | } | |
739 | } | |
740 | assert(!TokenQueue.empty() && | |
741 | "fetchMoreTokens lied about getting tokens!"); | |
742 | ||
743 | removeStaleSimpleKeyCandidates(); | |
744 | SimpleKey SK; | |
745 | SK.Tok = TokenQueue.front(); | |
746 | if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) | |
747 | == SimpleKeys.end()) | |
748 | break; | |
749 | else | |
750 | NeedMore = true; | |
751 | } | |
752 | return TokenQueue.front(); | |
753 | } | |
754 | ||
755 | Token Scanner::getNext() { | |
756 | Token Ret = peekNext(); | |
757 | // TokenQueue can be empty if there was an error getting the next token. | |
758 | if (!TokenQueue.empty()) | |
759 | TokenQueue.pop_front(); | |
760 | ||
761 | // There cannot be any referenced Token's if the TokenQueue is empty. So do a | |
762 | // quick deallocation of them all. | |
763 | if (TokenQueue.empty()) { | |
764 | TokenQueue.Alloc.Reset(); | |
765 | } | |
766 | ||
767 | return Ret; | |
768 | } | |
769 | ||
770 | StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { | |
771 | if (Position == End) | |
772 | return Position; | |
773 | // Check 7 bit c-printable - b-char. | |
774 | if ( *Position == 0x09 | |
775 | || (*Position >= 0x20 && *Position <= 0x7E)) | |
776 | return Position + 1; | |
777 | ||
778 | // Check for valid UTF-8. | |
779 | if (uint8_t(*Position) & 0x80) { | |
780 | UTF8Decoded u8d = decodeUTF8(Position); | |
781 | if ( u8d.second != 0 | |
782 | && u8d.first != 0xFEFF | |
783 | && ( u8d.first == 0x85 | |
784 | || ( u8d.first >= 0xA0 | |
785 | && u8d.first <= 0xD7FF) | |
786 | || ( u8d.first >= 0xE000 | |
787 | && u8d.first <= 0xFFFD) | |
788 | || ( u8d.first >= 0x10000 | |
789 | && u8d.first <= 0x10FFFF))) | |
790 | return Position + u8d.second; | |
791 | } | |
792 | return Position; | |
793 | } | |
794 | ||
795 | StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { | |
796 | if (Position == End) | |
797 | return Position; | |
798 | if (*Position == 0x0D) { | |
799 | if (Position + 1 != End && *(Position + 1) == 0x0A) | |
800 | return Position + 2; | |
801 | return Position + 1; | |
802 | } | |
803 | ||
804 | if (*Position == 0x0A) | |
805 | return Position + 1; | |
806 | return Position; | |
807 | } | |
808 | ||
809 | ||
810 | StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { | |
811 | if (Position == End) | |
812 | return Position; | |
813 | if (*Position == ' ' || *Position == '\t') | |
814 | return Position + 1; | |
815 | return Position; | |
816 | } | |
817 | ||
818 | StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { | |
819 | if (Position == End) | |
820 | return Position; | |
821 | if (*Position == ' ' || *Position == '\t') | |
822 | return Position; | |
823 | return skip_nb_char(Position); | |
824 | } | |
825 | ||
826 | StringRef::iterator Scanner::skip_while( SkipWhileFunc Func | |
827 | , StringRef::iterator Position) { | |
828 | while (true) { | |
829 | StringRef::iterator i = (this->*Func)(Position); | |
830 | if (i == Position) | |
831 | break; | |
832 | Position = i; | |
833 | } | |
834 | return Position; | |
835 | } | |
836 | ||
/// Return true if \a C is an ns-hex-digit[36]: a decimal digit or one of the
/// letters a-f / A-F. Letters beyond 'f' are not hexadecimal digits.
static bool is_ns_hex_digit(const char C) {
  return (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'f')
      || (C >= 'A' && C <= 'F');
}
842 | ||
/// Return true if \a C is an ns-word-char[38]: a decimal digit, an ASCII
/// letter, or '-'.
static bool is_ns_word_char(const char C) {
  return C == '-'
      || (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'z')
      || (C >= 'A' && C <= 'Z');
}
848 | ||
849 | StringRef Scanner::scan_ns_uri_char() { | |
850 | StringRef::iterator Start = Current; | |
851 | while (true) { | |
852 | if (Current == End) | |
853 | break; | |
854 | if (( *Current == '%' | |
855 | && Current + 2 < End | |
856 | && is_ns_hex_digit(*(Current + 1)) | |
857 | && is_ns_hex_digit(*(Current + 2))) | |
858 | || is_ns_word_char(*Current) | |
859 | || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") | |
860 | != StringRef::npos) { | |
861 | ++Current; | |
862 | ++Column; | |
863 | } else | |
864 | break; | |
865 | } | |
866 | return StringRef(Start, Current - Start); | |
867 | } | |
868 | ||
223e47cc LB |
869 | bool Scanner::consume(uint32_t Expected) { |
870 | if (Expected >= 0x80) | |
871 | report_fatal_error("Not dealing with this yet"); | |
872 | if (Current == End) | |
873 | return false; | |
874 | if (uint8_t(*Current) >= 0x80) | |
875 | report_fatal_error("Not dealing with this yet"); | |
876 | if (uint8_t(*Current) == Expected) { | |
877 | ++Current; | |
878 | ++Column; | |
879 | return true; | |
880 | } | |
881 | return false; | |
882 | } | |
883 | ||
884 | void Scanner::skip(uint32_t Distance) { | |
885 | Current += Distance; | |
886 | Column += Distance; | |
887 | assert(Current <= End && "Skipped past the end"); | |
888 | } | |
889 | ||
890 | bool Scanner::isBlankOrBreak(StringRef::iterator Position) { | |
891 | if (Position == End) | |
892 | return false; | |
893 | if ( *Position == ' ' || *Position == '\t' | |
894 | || *Position == '\r' || *Position == '\n') | |
895 | return true; | |
896 | return false; | |
897 | } | |
898 | ||
899 | void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok | |
900 | , unsigned AtColumn | |
901 | , bool IsRequired) { | |
902 | if (IsSimpleKeyAllowed) { | |
903 | SimpleKey SK; | |
904 | SK.Tok = Tok; | |
905 | SK.Line = Line; | |
906 | SK.Column = AtColumn; | |
907 | SK.IsRequired = IsRequired; | |
908 | SK.FlowLevel = FlowLevel; | |
909 | SimpleKeys.push_back(SK); | |
910 | } | |
911 | } | |
912 | ||
913 | void Scanner::removeStaleSimpleKeyCandidates() { | |
914 | for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); | |
915 | i != SimpleKeys.end();) { | |
916 | if (i->Line != Line || i->Column + 1024 < Column) { | |
917 | if (i->IsRequired) | |
918 | setError( "Could not find expected : for simple key" | |
919 | , i->Tok->Range.begin()); | |
920 | i = SimpleKeys.erase(i); | |
921 | } else | |
922 | ++i; | |
923 | } | |
924 | } | |
925 | ||
926 | void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { | |
927 | if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) | |
928 | SimpleKeys.pop_back(); | |
929 | } | |
930 | ||
931 | bool Scanner::unrollIndent(int ToColumn) { | |
932 | Token T; | |
933 | // Indentation is ignored in flow. | |
934 | if (FlowLevel != 0) | |
935 | return true; | |
936 | ||
937 | while (Indent > ToColumn) { | |
938 | T.Kind = Token::TK_BlockEnd; | |
939 | T.Range = StringRef(Current, 1); | |
940 | TokenQueue.push_back(T); | |
941 | Indent = Indents.pop_back_val(); | |
942 | } | |
943 | ||
944 | return true; | |
945 | } | |
946 | ||
947 | bool Scanner::rollIndent( int ToColumn | |
948 | , Token::TokenKind Kind | |
949 | , TokenQueueT::iterator InsertPoint) { | |
950 | if (FlowLevel) | |
951 | return true; | |
952 | if (Indent < ToColumn) { | |
953 | Indents.push_back(Indent); | |
954 | Indent = ToColumn; | |
955 | ||
956 | Token T; | |
957 | T.Kind = Kind; | |
958 | T.Range = StringRef(Current, 0); | |
959 | TokenQueue.insert(InsertPoint, T); | |
960 | } | |
961 | return true; | |
962 | } | |
963 | ||
964 | void Scanner::scanToNextToken() { | |
965 | while (true) { | |
966 | while (*Current == ' ' || *Current == '\t') { | |
967 | skip(1); | |
968 | } | |
969 | ||
970 | // Skip comment. | |
971 | if (*Current == '#') { | |
972 | while (true) { | |
973 | // This may skip more than one byte, thus Column is only incremented | |
974 | // for code points. | |
975 | StringRef::iterator i = skip_nb_char(Current); | |
976 | if (i == Current) | |
977 | break; | |
978 | Current = i; | |
979 | ++Column; | |
980 | } | |
981 | } | |
982 | ||
983 | // Skip EOL. | |
984 | StringRef::iterator i = skip_b_break(Current); | |
985 | if (i == Current) | |
986 | break; | |
987 | Current = i; | |
988 | ++Line; | |
989 | Column = 0; | |
990 | // New lines may start a simple key. | |
991 | if (!FlowLevel) | |
992 | IsSimpleKeyAllowed = true; | |
993 | } | |
994 | } | |
995 | ||
996 | bool Scanner::scanStreamStart() { | |
997 | IsStartOfStream = false; | |
998 | ||
999 | EncodingInfo EI = getUnicodeEncoding(currentInput()); | |
1000 | ||
1001 | Token T; | |
1002 | T.Kind = Token::TK_StreamStart; | |
1003 | T.Range = StringRef(Current, EI.second); | |
1004 | TokenQueue.push_back(T); | |
1005 | Current += EI.second; | |
1006 | return true; | |
1007 | } | |
1008 | ||
1009 | bool Scanner::scanStreamEnd() { | |
1010 | // Force an ending new line if one isn't present. | |
1011 | if (Column != 0) { | |
1012 | Column = 0; | |
1013 | ++Line; | |
1014 | } | |
1015 | ||
1016 | unrollIndent(-1); | |
1017 | SimpleKeys.clear(); | |
1018 | IsSimpleKeyAllowed = false; | |
1019 | ||
1020 | Token T; | |
1021 | T.Kind = Token::TK_StreamEnd; | |
1022 | T.Range = StringRef(Current, 0); | |
1023 | TokenQueue.push_back(T); | |
1024 | return true; | |
1025 | } | |
1026 | ||
1027 | bool Scanner::scanDirective() { | |
1028 | // Reset the indentation level. | |
1029 | unrollIndent(-1); | |
1030 | SimpleKeys.clear(); | |
1031 | IsSimpleKeyAllowed = false; | |
1032 | ||
1033 | StringRef::iterator Start = Current; | |
1034 | consume('%'); | |
1035 | StringRef::iterator NameStart = Current; | |
1036 | Current = skip_while(&Scanner::skip_ns_char, Current); | |
1037 | StringRef Name(NameStart, Current - NameStart); | |
1038 | Current = skip_while(&Scanner::skip_s_white, Current); | |
1a4d82fc JJ |
1039 | |
1040 | Token T; | |
223e47cc LB |
1041 | if (Name == "YAML") { |
1042 | Current = skip_while(&Scanner::skip_ns_char, Current); | |
223e47cc LB |
1043 | T.Kind = Token::TK_VersionDirective; |
1044 | T.Range = StringRef(Start, Current - Start); | |
1045 | TokenQueue.push_back(T); | |
1046 | return true; | |
1a4d82fc JJ |
1047 | } else if(Name == "TAG") { |
1048 | Current = skip_while(&Scanner::skip_ns_char, Current); | |
1049 | Current = skip_while(&Scanner::skip_s_white, Current); | |
1050 | Current = skip_while(&Scanner::skip_ns_char, Current); | |
1051 | T.Kind = Token::TK_TagDirective; | |
1052 | T.Range = StringRef(Start, Current - Start); | |
1053 | TokenQueue.push_back(T); | |
1054 | return true; | |
223e47cc LB |
1055 | } |
1056 | return false; | |
1057 | } | |
1058 | ||
1059 | bool Scanner::scanDocumentIndicator(bool IsStart) { | |
1060 | unrollIndent(-1); | |
1061 | SimpleKeys.clear(); | |
1062 | IsSimpleKeyAllowed = false; | |
1063 | ||
1064 | Token T; | |
1065 | T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; | |
1066 | T.Range = StringRef(Current, 3); | |
1067 | skip(3); | |
1068 | TokenQueue.push_back(T); | |
1069 | return true; | |
1070 | } | |
1071 | ||
1072 | bool Scanner::scanFlowCollectionStart(bool IsSequence) { | |
1073 | Token T; | |
1074 | T.Kind = IsSequence ? Token::TK_FlowSequenceStart | |
1075 | : Token::TK_FlowMappingStart; | |
1076 | T.Range = StringRef(Current, 1); | |
1077 | skip(1); | |
1078 | TokenQueue.push_back(T); | |
1079 | ||
1080 | // [ and { may begin a simple key. | |
1081 | saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); | |
1082 | ||
1083 | // And may also be followed by a simple key. | |
1084 | IsSimpleKeyAllowed = true; | |
1085 | ++FlowLevel; | |
1086 | return true; | |
1087 | } | |
1088 | ||
1089 | bool Scanner::scanFlowCollectionEnd(bool IsSequence) { | |
1090 | removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); | |
1091 | IsSimpleKeyAllowed = false; | |
1092 | Token T; | |
1093 | T.Kind = IsSequence ? Token::TK_FlowSequenceEnd | |
1094 | : Token::TK_FlowMappingEnd; | |
1095 | T.Range = StringRef(Current, 1); | |
1096 | skip(1); | |
1097 | TokenQueue.push_back(T); | |
1098 | if (FlowLevel) | |
1099 | --FlowLevel; | |
1100 | return true; | |
1101 | } | |
1102 | ||
1103 | bool Scanner::scanFlowEntry() { | |
1104 | removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); | |
1105 | IsSimpleKeyAllowed = true; | |
1106 | Token T; | |
1107 | T.Kind = Token::TK_FlowEntry; | |
1108 | T.Range = StringRef(Current, 1); | |
1109 | skip(1); | |
1110 | TokenQueue.push_back(T); | |
1111 | return true; | |
1112 | } | |
1113 | ||
1114 | bool Scanner::scanBlockEntry() { | |
1115 | rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); | |
1116 | removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); | |
1117 | IsSimpleKeyAllowed = true; | |
1118 | Token T; | |
1119 | T.Kind = Token::TK_BlockEntry; | |
1120 | T.Range = StringRef(Current, 1); | |
1121 | skip(1); | |
1122 | TokenQueue.push_back(T); | |
1123 | return true; | |
1124 | } | |
1125 | ||
1126 | bool Scanner::scanKey() { | |
1127 | if (!FlowLevel) | |
1128 | rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); | |
1129 | ||
1130 | removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); | |
1131 | IsSimpleKeyAllowed = !FlowLevel; | |
1132 | ||
1133 | Token T; | |
1134 | T.Kind = Token::TK_Key; | |
1135 | T.Range = StringRef(Current, 1); | |
1136 | skip(1); | |
1137 | TokenQueue.push_back(T); | |
1138 | return true; | |
1139 | } | |
1140 | ||
1141 | bool Scanner::scanValue() { | |
1142 | // If the previous token could have been a simple key, insert the key token | |
1143 | // into the token queue. | |
1144 | if (!SimpleKeys.empty()) { | |
1145 | SimpleKey SK = SimpleKeys.pop_back_val(); | |
1146 | Token T; | |
1147 | T.Kind = Token::TK_Key; | |
1148 | T.Range = SK.Tok->Range; | |
1149 | TokenQueueT::iterator i, e; | |
1150 | for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { | |
1151 | if (i == SK.Tok) | |
1152 | break; | |
1153 | } | |
1154 | assert(i != e && "SimpleKey not in token queue!"); | |
1155 | i = TokenQueue.insert(i, T); | |
1156 | ||
1157 | // We may also need to add a Block-Mapping-Start token. | |
1158 | rollIndent(SK.Column, Token::TK_BlockMappingStart, i); | |
1159 | ||
1160 | IsSimpleKeyAllowed = false; | |
1161 | } else { | |
1162 | if (!FlowLevel) | |
1163 | rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); | |
1164 | IsSimpleKeyAllowed = !FlowLevel; | |
1165 | } | |
1166 | ||
1167 | Token T; | |
1168 | T.Kind = Token::TK_Value; | |
1169 | T.Range = StringRef(Current, 1); | |
1170 | skip(1); | |
1171 | TokenQueue.push_back(T); | |
1172 | return true; | |
1173 | } | |
1174 | ||
1175 | // Forbidding inlining improves performance by roughly 20%. | |
1176 | // FIXME: Remove once llvm optimizes this to the faster version without hints. | |
1177 | LLVM_ATTRIBUTE_NOINLINE static bool | |
1178 | wasEscaped(StringRef::iterator First, StringRef::iterator Position); | |
1179 | ||
1180 | // Returns whether a character at 'Position' was escaped with a leading '\'. | |
1181 | // 'First' specifies the position of the first character in the string. | |
1182 | static bool wasEscaped(StringRef::iterator First, | |
1183 | StringRef::iterator Position) { | |
1184 | assert(Position - 1 >= First); | |
1185 | StringRef::iterator I = Position - 1; | |
1186 | // We calculate the number of consecutive '\'s before the current position | |
1187 | // by iterating backwards through our string. | |
1188 | while (I >= First && *I == '\\') --I; | |
1189 | // (Position - 1 - I) now contains the number of '\'s before the current | |
1190 | // position. If it is odd, the character at 'Position' was escaped. | |
1191 | return (Position - 1 - I) % 2 == 1; | |
1192 | } | |
1193 | ||
1194 | bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { | |
1195 | StringRef::iterator Start = Current; | |
1196 | unsigned ColStart = Column; | |
1197 | if (IsDoubleQuoted) { | |
1198 | do { | |
1199 | ++Current; | |
1200 | while (Current != End && *Current != '"') | |
1201 | ++Current; | |
1202 | // Repeat until the previous character was not a '\' or was an escaped | |
1203 | // backslash. | |
1204 | } while ( Current != End | |
1205 | && *(Current - 1) == '\\' | |
1206 | && wasEscaped(Start + 1, Current)); | |
1207 | } else { | |
1208 | skip(1); | |
1209 | while (true) { | |
1210 | // Skip a ' followed by another '. | |
1211 | if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { | |
1212 | skip(2); | |
1213 | continue; | |
1214 | } else if (*Current == '\'') | |
1215 | break; | |
1216 | StringRef::iterator i = skip_nb_char(Current); | |
1217 | if (i == Current) { | |
1218 | i = skip_b_break(Current); | |
1219 | if (i == Current) | |
1220 | break; | |
1221 | Current = i; | |
1222 | Column = 0; | |
1223 | ++Line; | |
1224 | } else { | |
1225 | if (i == End) | |
1226 | break; | |
1227 | Current = i; | |
1228 | ++Column; | |
1229 | } | |
1230 | } | |
1231 | } | |
1232 | ||
1233 | if (Current == End) { | |
1234 | setError("Expected quote at end of scalar", Current); | |
1235 | return false; | |
1236 | } | |
1237 | ||
1238 | skip(1); // Skip ending quote. | |
1239 | Token T; | |
1240 | T.Kind = Token::TK_Scalar; | |
1241 | T.Range = StringRef(Start, Current - Start); | |
1242 | TokenQueue.push_back(T); | |
1243 | ||
1244 | saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); | |
1245 | ||
1246 | IsSimpleKeyAllowed = false; | |
1247 | ||
1248 | return true; | |
1249 | } | |
1250 | ||
1251 | bool Scanner::scanPlainScalar() { | |
1252 | StringRef::iterator Start = Current; | |
1253 | unsigned ColStart = Column; | |
1254 | unsigned LeadingBlanks = 0; | |
1255 | assert(Indent >= -1 && "Indent must be >= -1 !"); | |
1256 | unsigned indent = static_cast<unsigned>(Indent + 1); | |
1257 | while (true) { | |
1258 | if (*Current == '#') | |
1259 | break; | |
1260 | ||
1261 | while (!isBlankOrBreak(Current)) { | |
1262 | if ( FlowLevel && *Current == ':' | |
1263 | && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { | |
1264 | setError("Found unexpected ':' while scanning a plain scalar", Current); | |
1265 | return false; | |
1266 | } | |
1267 | ||
1268 | // Check for the end of the plain scalar. | |
1269 | if ( (*Current == ':' && isBlankOrBreak(Current + 1)) | |
1270 | || ( FlowLevel | |
1271 | && (StringRef(Current, 1).find_first_of(",:?[]{}") | |
1272 | != StringRef::npos))) | |
1273 | break; | |
1274 | ||
1275 | StringRef::iterator i = skip_nb_char(Current); | |
1276 | if (i == Current) | |
1277 | break; | |
1278 | Current = i; | |
1279 | ++Column; | |
1280 | } | |
1281 | ||
1282 | // Are we at the end? | |
1283 | if (!isBlankOrBreak(Current)) | |
1284 | break; | |
1285 | ||
1286 | // Eat blanks. | |
1287 | StringRef::iterator Tmp = Current; | |
1288 | while (isBlankOrBreak(Tmp)) { | |
1289 | StringRef::iterator i = skip_s_white(Tmp); | |
1290 | if (i != Tmp) { | |
1291 | if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { | |
1292 | setError("Found invalid tab character in indentation", Tmp); | |
1293 | return false; | |
1294 | } | |
1295 | Tmp = i; | |
1296 | ++Column; | |
1297 | } else { | |
1298 | i = skip_b_break(Tmp); | |
1299 | if (!LeadingBlanks) | |
1300 | LeadingBlanks = 1; | |
1301 | Tmp = i; | |
1302 | Column = 0; | |
1303 | ++Line; | |
1304 | } | |
1305 | } | |
1306 | ||
1307 | if (!FlowLevel && Column < indent) | |
1308 | break; | |
1309 | ||
1310 | Current = Tmp; | |
1311 | } | |
1312 | if (Start == Current) { | |
1313 | setError("Got empty plain scalar", Start); | |
1314 | return false; | |
1315 | } | |
1316 | Token T; | |
1317 | T.Kind = Token::TK_Scalar; | |
1318 | T.Range = StringRef(Start, Current - Start); | |
1319 | TokenQueue.push_back(T); | |
1320 | ||
1321 | // Plain scalars can be simple keys. | |
1322 | saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); | |
1323 | ||
1324 | IsSimpleKeyAllowed = false; | |
1325 | ||
1326 | return true; | |
1327 | } | |
1328 | ||
1329 | bool Scanner::scanAliasOrAnchor(bool IsAlias) { | |
1330 | StringRef::iterator Start = Current; | |
1331 | unsigned ColStart = Column; | |
1332 | skip(1); | |
1333 | while(true) { | |
1334 | if ( *Current == '[' || *Current == ']' | |
1335 | || *Current == '{' || *Current == '}' | |
1336 | || *Current == ',' | |
1337 | || *Current == ':') | |
1338 | break; | |
1339 | StringRef::iterator i = skip_ns_char(Current); | |
1340 | if (i == Current) | |
1341 | break; | |
1342 | Current = i; | |
1343 | ++Column; | |
1344 | } | |
1345 | ||
1346 | if (Start == Current) { | |
1347 | setError("Got empty alias or anchor", Start); | |
1348 | return false; | |
1349 | } | |
1350 | ||
1351 | Token T; | |
1352 | T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; | |
1353 | T.Range = StringRef(Start, Current - Start); | |
1354 | TokenQueue.push_back(T); | |
1355 | ||
1356 | // Alias and anchors can be simple keys. | |
1357 | saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); | |
1358 | ||
1359 | IsSimpleKeyAllowed = false; | |
1360 | ||
1361 | return true; | |
1362 | } | |
1363 | ||
1364 | bool Scanner::scanBlockScalar(bool IsLiteral) { | |
1365 | StringRef::iterator Start = Current; | |
1366 | skip(1); // Eat | or > | |
1367 | while(true) { | |
1368 | StringRef::iterator i = skip_nb_char(Current); | |
1369 | if (i == Current) { | |
1370 | if (Column == 0) | |
1371 | break; | |
1372 | i = skip_b_break(Current); | |
1373 | if (i != Current) { | |
1374 | // We got a line break. | |
1375 | Column = 0; | |
1376 | ++Line; | |
1377 | Current = i; | |
1378 | continue; | |
1379 | } else { | |
1380 | // There was an error, which should already have been printed out. | |
1381 | return false; | |
1382 | } | |
1383 | } | |
1384 | Current = i; | |
1385 | ++Column; | |
1386 | } | |
1387 | ||
1388 | if (Start == Current) { | |
1389 | setError("Got empty block scalar", Start); | |
1390 | return false; | |
1391 | } | |
1392 | ||
1393 | Token T; | |
1394 | T.Kind = Token::TK_Scalar; | |
1395 | T.Range = StringRef(Start, Current - Start); | |
1396 | TokenQueue.push_back(T); | |
1397 | return true; | |
1398 | } | |
1399 | ||
1400 | bool Scanner::scanTag() { | |
1401 | StringRef::iterator Start = Current; | |
1402 | unsigned ColStart = Column; | |
1403 | skip(1); // Eat !. | |
1404 | if (Current == End || isBlankOrBreak(Current)); // An empty tag. | |
1405 | else if (*Current == '<') { | |
1406 | skip(1); | |
1407 | scan_ns_uri_char(); | |
1408 | if (!consume('>')) | |
1409 | return false; | |
1410 | } else { | |
1411 | // FIXME: Actually parse the c-ns-shorthand-tag rule. | |
1412 | Current = skip_while(&Scanner::skip_ns_char, Current); | |
1413 | } | |
1414 | ||
1415 | Token T; | |
1416 | T.Kind = Token::TK_Tag; | |
1417 | T.Range = StringRef(Start, Current - Start); | |
1418 | TokenQueue.push_back(T); | |
1419 | ||
1420 | // Tags can be simple keys. | |
1421 | saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); | |
1422 | ||
1423 | IsSimpleKeyAllowed = false; | |
1424 | ||
1425 | return true; | |
1426 | } | |
1427 | ||
1428 | bool Scanner::fetchMoreTokens() { | |
1429 | if (IsStartOfStream) | |
1430 | return scanStreamStart(); | |
1431 | ||
1432 | scanToNextToken(); | |
1433 | ||
1434 | if (Current == End) | |
1435 | return scanStreamEnd(); | |
1436 | ||
1437 | removeStaleSimpleKeyCandidates(); | |
1438 | ||
1439 | unrollIndent(Column); | |
1440 | ||
1441 | if (Column == 0 && *Current == '%') | |
1442 | return scanDirective(); | |
1443 | ||
1444 | if (Column == 0 && Current + 4 <= End | |
1445 | && *Current == '-' | |
1446 | && *(Current + 1) == '-' | |
1447 | && *(Current + 2) == '-' | |
1448 | && (Current + 3 == End || isBlankOrBreak(Current + 3))) | |
1449 | return scanDocumentIndicator(true); | |
1450 | ||
1451 | if (Column == 0 && Current + 4 <= End | |
1452 | && *Current == '.' | |
1453 | && *(Current + 1) == '.' | |
1454 | && *(Current + 2) == '.' | |
1455 | && (Current + 3 == End || isBlankOrBreak(Current + 3))) | |
1456 | return scanDocumentIndicator(false); | |
1457 | ||
1458 | if (*Current == '[') | |
1459 | return scanFlowCollectionStart(true); | |
1460 | ||
1461 | if (*Current == '{') | |
1462 | return scanFlowCollectionStart(false); | |
1463 | ||
1464 | if (*Current == ']') | |
1465 | return scanFlowCollectionEnd(true); | |
1466 | ||
1467 | if (*Current == '}') | |
1468 | return scanFlowCollectionEnd(false); | |
1469 | ||
1470 | if (*Current == ',') | |
1471 | return scanFlowEntry(); | |
1472 | ||
1473 | if (*Current == '-' && isBlankOrBreak(Current + 1)) | |
1474 | return scanBlockEntry(); | |
1475 | ||
1476 | if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) | |
1477 | return scanKey(); | |
1478 | ||
1479 | if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) | |
1480 | return scanValue(); | |
1481 | ||
1482 | if (*Current == '*') | |
1483 | return scanAliasOrAnchor(true); | |
1484 | ||
1485 | if (*Current == '&') | |
1486 | return scanAliasOrAnchor(false); | |
1487 | ||
1488 | if (*Current == '!') | |
1489 | return scanTag(); | |
1490 | ||
1491 | if (*Current == '|' && !FlowLevel) | |
1492 | return scanBlockScalar(true); | |
1493 | ||
1494 | if (*Current == '>' && !FlowLevel) | |
1495 | return scanBlockScalar(false); | |
1496 | ||
1497 | if (*Current == '\'') | |
1498 | return scanFlowScalar(false); | |
1499 | ||
1500 | if (*Current == '"') | |
1501 | return scanFlowScalar(true); | |
1502 | ||
1503 | // Get a plain scalar. | |
1504 | StringRef FirstChar(Current, 1); | |
1505 | if (!(isBlankOrBreak(Current) | |
1506 | || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) | |
1507 | || (*Current == '-' && !isBlankOrBreak(Current + 1)) | |
1508 | || (!FlowLevel && (*Current == '?' || *Current == ':') | |
1509 | && isBlankOrBreak(Current + 1)) | |
1510 | || (!FlowLevel && *Current == ':' | |
1511 | && Current + 2 < End | |
1512 | && *(Current + 1) == ':' | |
1513 | && !isBlankOrBreak(Current + 2))) | |
1514 | return scanPlainScalar(); | |
1515 | ||
1516 | setError("Unrecognized character while tokenizing."); | |
1517 | return false; | |
1518 | } | |
1519 | ||
1520 | Stream::Stream(StringRef Input, SourceMgr &SM) | |
1a4d82fc | 1521 | : scanner(new Scanner(Input, SM)), CurrentDoc() {} |
223e47cc | 1522 | |
1a4d82fc JJ |
1523 | Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM) |
1524 | : scanner(new Scanner(InputBuffer, SM)), CurrentDoc() {} | |
970d7e83 | 1525 | |
223e47cc LB |
1526 | Stream::~Stream() {} |
1527 | ||
1528 | bool Stream::failed() { return scanner->failed(); } | |
1529 | ||
1530 | void Stream::printError(Node *N, const Twine &Msg) { | |
1531 | SmallVector<SMRange, 1> Ranges; | |
1532 | Ranges.push_back(N->getSourceRange()); | |
1533 | scanner->printError( N->getSourceRange().Start | |
1534 | , SourceMgr::DK_Error | |
1535 | , Msg | |
1536 | , Ranges); | |
1537 | } | |
1538 | ||
223e47cc LB |
1539 | document_iterator Stream::begin() { |
1540 | if (CurrentDoc) | |
1541 | report_fatal_error("Can only iterate over the stream once"); | |
1542 | ||
1543 | // Skip Stream-Start. | |
1544 | scanner->getNext(); | |
1545 | ||
1546 | CurrentDoc.reset(new Document(*this)); | |
1547 | return document_iterator(CurrentDoc); | |
1548 | } | |
1549 | ||
1550 | document_iterator Stream::end() { | |
1551 | return document_iterator(); | |
1552 | } | |
1553 | ||
1554 | void Stream::skip() { | |
1555 | for (document_iterator i = begin(), e = end(); i != e; ++i) | |
1556 | i->skip(); | |
1557 | } | |
1558 | ||
1a4d82fc JJ |
1559 | Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, |
1560 | StringRef T) | |
1561 | : Doc(D), TypeID(Type), Anchor(A), Tag(T) { | |
223e47cc LB |
1562 | SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); |
1563 | SourceRange = SMRange(Start, Start); | |
1564 | } | |
1565 | ||
1a4d82fc JJ |
1566 | std::string Node::getVerbatimTag() const { |
1567 | StringRef Raw = getRawTag(); | |
1568 | if (!Raw.empty() && Raw != "!") { | |
1569 | std::string Ret; | |
1570 | if (Raw.find_last_of('!') == 0) { | |
1571 | Ret = Doc->getTagMap().find("!")->second; | |
1572 | Ret += Raw.substr(1); | |
1573 | return std::move(Ret); | |
1574 | } else if (Raw.startswith("!!")) { | |
1575 | Ret = Doc->getTagMap().find("!!")->second; | |
1576 | Ret += Raw.substr(2); | |
1577 | return std::move(Ret); | |
1578 | } else { | |
1579 | StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); | |
1580 | std::map<StringRef, StringRef>::const_iterator It = | |
1581 | Doc->getTagMap().find(TagHandle); | |
1582 | if (It != Doc->getTagMap().end()) | |
1583 | Ret = It->second; | |
1584 | else { | |
1585 | Token T; | |
1586 | T.Kind = Token::TK_Tag; | |
1587 | T.Range = TagHandle; | |
1588 | setError(Twine("Unknown tag handle ") + TagHandle, T); | |
1589 | } | |
1590 | Ret += Raw.substr(Raw.find_last_of('!') + 1); | |
1591 | return std::move(Ret); | |
1592 | } | |
1593 | } | |
1594 | ||
1595 | switch (getType()) { | |
1596 | case NK_Null: | |
1597 | return "tag:yaml.org,2002:null"; | |
1598 | case NK_Scalar: | |
1599 | // TODO: Tag resolution. | |
1600 | return "tag:yaml.org,2002:str"; | |
1601 | case NK_Mapping: | |
1602 | return "tag:yaml.org,2002:map"; | |
1603 | case NK_Sequence: | |
1604 | return "tag:yaml.org,2002:seq"; | |
1605 | } | |
1606 | ||
1607 | return ""; | |
1608 | } | |
1609 | ||
223e47cc LB |
1610 | Token &Node::peekNext() { |
1611 | return Doc->peekNext(); | |
1612 | } | |
1613 | ||
1614 | Token Node::getNext() { | |
1615 | return Doc->getNext(); | |
1616 | } | |
1617 | ||
1618 | Node *Node::parseBlockNode() { | |
1619 | return Doc->parseBlockNode(); | |
1620 | } | |
1621 | ||
1622 | BumpPtrAllocator &Node::getAllocator() { | |
1623 | return Doc->NodeAllocator; | |
1624 | } | |
1625 | ||
1626 | void Node::setError(const Twine &Msg, Token &Tok) const { | |
1627 | Doc->setError(Msg, Tok); | |
1628 | } | |
1629 | ||
1630 | bool Node::failed() const { | |
1631 | return Doc->failed(); | |
1632 | } | |
1633 | ||
1634 | ||
1635 | ||
1636 | StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { | |
1637 | // TODO: Handle newlines properly. We need to remove leading whitespace. | |
1638 | if (Value[0] == '"') { // Double quoted. | |
1639 | // Pull off the leading and trailing "s. | |
1640 | StringRef UnquotedValue = Value.substr(1, Value.size() - 2); | |
1641 | // Search for characters that would require unescaping the value. | |
1642 | StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); | |
1643 | if (i != StringRef::npos) | |
1644 | return unescapeDoubleQuoted(UnquotedValue, i, Storage); | |
1645 | return UnquotedValue; | |
1646 | } else if (Value[0] == '\'') { // Single quoted. | |
1647 | // Pull off the leading and trailing 's. | |
1648 | StringRef UnquotedValue = Value.substr(1, Value.size() - 2); | |
1649 | StringRef::size_type i = UnquotedValue.find('\''); | |
1650 | if (i != StringRef::npos) { | |
1651 | // We're going to need Storage. | |
1652 | Storage.clear(); | |
1653 | Storage.reserve(UnquotedValue.size()); | |
1654 | for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { | |
1655 | StringRef Valid(UnquotedValue.begin(), i); | |
1656 | Storage.insert(Storage.end(), Valid.begin(), Valid.end()); | |
1657 | Storage.push_back('\''); | |
1658 | UnquotedValue = UnquotedValue.substr(i + 2); | |
1659 | } | |
1660 | Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); | |
1661 | return StringRef(Storage.begin(), Storage.size()); | |
1662 | } | |
1663 | return UnquotedValue; | |
1664 | } | |
1665 | // Plain or block. | |
1666 | return Value.rtrim(" "); | |
1667 | } | |
1668 | ||
1669 | StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue | |
1670 | , StringRef::size_type i | |
1671 | , SmallVectorImpl<char> &Storage) | |
1672 | const { | |
1673 | // Use Storage to build proper value. | |
1674 | Storage.clear(); | |
1675 | Storage.reserve(UnquotedValue.size()); | |
1676 | for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { | |
1677 | // Insert all previous chars into Storage. | |
1678 | StringRef Valid(UnquotedValue.begin(), i); | |
1679 | Storage.insert(Storage.end(), Valid.begin(), Valid.end()); | |
1680 | // Chop off inserted chars. | |
1681 | UnquotedValue = UnquotedValue.substr(i); | |
1682 | ||
1683 | assert(!UnquotedValue.empty() && "Can't be empty!"); | |
1684 | ||
1685 | // Parse escape or line break. | |
1686 | switch (UnquotedValue[0]) { | |
1687 | case '\r': | |
1688 | case '\n': | |
1689 | Storage.push_back('\n'); | |
1690 | if ( UnquotedValue.size() > 1 | |
1691 | && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) | |
1692 | UnquotedValue = UnquotedValue.substr(1); | |
1693 | UnquotedValue = UnquotedValue.substr(1); | |
1694 | break; | |
1695 | default: | |
1696 | if (UnquotedValue.size() == 1) | |
1697 | // TODO: Report error. | |
1698 | break; | |
1699 | UnquotedValue = UnquotedValue.substr(1); | |
1700 | switch (UnquotedValue[0]) { | |
1701 | default: { | |
1702 | Token T; | |
1703 | T.Range = StringRef(UnquotedValue.begin(), 1); | |
1704 | setError("Unrecognized escape code!", T); | |
1705 | return ""; | |
1706 | } | |
1707 | case '\r': | |
1708 | case '\n': | |
1709 | // Remove the new line. | |
1710 | if ( UnquotedValue.size() > 1 | |
1711 | && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) | |
1712 | UnquotedValue = UnquotedValue.substr(1); | |
1713 | // If this was just a single byte newline, it will get skipped | |
1714 | // below. | |
1715 | break; | |
1716 | case '0': | |
1717 | Storage.push_back(0x00); | |
1718 | break; | |
1719 | case 'a': | |
1720 | Storage.push_back(0x07); | |
1721 | break; | |
1722 | case 'b': | |
1723 | Storage.push_back(0x08); | |
1724 | break; | |
1725 | case 't': | |
1726 | case 0x09: | |
1727 | Storage.push_back(0x09); | |
1728 | break; | |
1729 | case 'n': | |
1730 | Storage.push_back(0x0A); | |
1731 | break; | |
1732 | case 'v': | |
1733 | Storage.push_back(0x0B); | |
1734 | break; | |
1735 | case 'f': | |
1736 | Storage.push_back(0x0C); | |
1737 | break; | |
1738 | case 'r': | |
1739 | Storage.push_back(0x0D); | |
1740 | break; | |
1741 | case 'e': | |
1742 | Storage.push_back(0x1B); | |
1743 | break; | |
1744 | case ' ': | |
1745 | Storage.push_back(0x20); | |
1746 | break; | |
1747 | case '"': | |
1748 | Storage.push_back(0x22); | |
1749 | break; | |
1750 | case '/': | |
1751 | Storage.push_back(0x2F); | |
1752 | break; | |
1753 | case '\\': | |
1754 | Storage.push_back(0x5C); | |
1755 | break; | |
1756 | case 'N': | |
1757 | encodeUTF8(0x85, Storage); | |
1758 | break; | |
1759 | case '_': | |
1760 | encodeUTF8(0xA0, Storage); | |
1761 | break; | |
1762 | case 'L': | |
1763 | encodeUTF8(0x2028, Storage); | |
1764 | break; | |
1765 | case 'P': | |
1766 | encodeUTF8(0x2029, Storage); | |
1767 | break; | |
1768 | case 'x': { | |
1769 | if (UnquotedValue.size() < 3) | |
1770 | // TODO: Report error. | |
1771 | break; | |
1772 | unsigned int UnicodeScalarValue; | |
1773 | if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) | |
1774 | // TODO: Report error. | |
1775 | UnicodeScalarValue = 0xFFFD; | |
1776 | encodeUTF8(UnicodeScalarValue, Storage); | |
1777 | UnquotedValue = UnquotedValue.substr(2); | |
1778 | break; | |
1779 | } | |
1780 | case 'u': { | |
1781 | if (UnquotedValue.size() < 5) | |
1782 | // TODO: Report error. | |
1783 | break; | |
1784 | unsigned int UnicodeScalarValue; | |
1785 | if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) | |
1786 | // TODO: Report error. | |
1787 | UnicodeScalarValue = 0xFFFD; | |
1788 | encodeUTF8(UnicodeScalarValue, Storage); | |
1789 | UnquotedValue = UnquotedValue.substr(4); | |
1790 | break; | |
1791 | } | |
1792 | case 'U': { | |
1793 | if (UnquotedValue.size() < 9) | |
1794 | // TODO: Report error. | |
1795 | break; | |
1796 | unsigned int UnicodeScalarValue; | |
1797 | if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) | |
1798 | // TODO: Report error. | |
1799 | UnicodeScalarValue = 0xFFFD; | |
1800 | encodeUTF8(UnicodeScalarValue, Storage); | |
1801 | UnquotedValue = UnquotedValue.substr(8); | |
1802 | break; | |
1803 | } | |
1804 | } | |
1805 | UnquotedValue = UnquotedValue.substr(1); | |
1806 | } | |
1807 | } | |
1808 | Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); | |
1809 | return StringRef(Storage.begin(), Storage.size()); | |
1810 | } | |
1811 | ||
1812 | Node *KeyValueNode::getKey() { | |
1813 | if (Key) | |
1814 | return Key; | |
1815 | // Handle implicit null keys. | |
1816 | { | |
1817 | Token &t = peekNext(); | |
1818 | if ( t.Kind == Token::TK_BlockEnd | |
1819 | || t.Kind == Token::TK_Value | |
1820 | || t.Kind == Token::TK_Error) { | |
1821 | return Key = new (getAllocator()) NullNode(Doc); | |
1822 | } | |
1823 | if (t.Kind == Token::TK_Key) | |
1824 | getNext(); // skip TK_Key. | |
1825 | } | |
1826 | ||
1827 | // Handle explicit null keys. | |
1828 | Token &t = peekNext(); | |
1829 | if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { | |
1830 | return Key = new (getAllocator()) NullNode(Doc); | |
1831 | } | |
1832 | ||
1833 | // We've got a normal key. | |
1834 | return Key = parseBlockNode(); | |
1835 | } | |
1836 | ||
1837 | Node *KeyValueNode::getValue() { | |
1838 | if (Value) | |
1839 | return Value; | |
1840 | getKey()->skip(); | |
1841 | if (failed()) | |
1842 | return Value = new (getAllocator()) NullNode(Doc); | |
1843 | ||
1844 | // Handle implicit null values. | |
1845 | { | |
1846 | Token &t = peekNext(); | |
1847 | if ( t.Kind == Token::TK_BlockEnd | |
1848 | || t.Kind == Token::TK_FlowMappingEnd | |
1849 | || t.Kind == Token::TK_Key | |
1850 | || t.Kind == Token::TK_FlowEntry | |
1851 | || t.Kind == Token::TK_Error) { | |
1852 | return Value = new (getAllocator()) NullNode(Doc); | |
1853 | } | |
1854 | ||
1855 | if (t.Kind != Token::TK_Value) { | |
1856 | setError("Unexpected token in Key Value.", t); | |
1857 | return Value = new (getAllocator()) NullNode(Doc); | |
1858 | } | |
1859 | getNext(); // skip TK_Value. | |
1860 | } | |
1861 | ||
1862 | // Handle explicit null values. | |
1863 | Token &t = peekNext(); | |
1864 | if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { | |
1865 | return Value = new (getAllocator()) NullNode(Doc); | |
1866 | } | |
1867 | ||
1868 | // We got a normal value. | |
1869 | return Value = parseBlockNode(); | |
1870 | } | |
1871 | ||
1872 | void MappingNode::increment() { | |
1873 | if (failed()) { | |
1874 | IsAtEnd = true; | |
1a4d82fc | 1875 | CurrentEntry = nullptr; |
223e47cc LB |
1876 | return; |
1877 | } | |
1878 | if (CurrentEntry) { | |
1879 | CurrentEntry->skip(); | |
1880 | if (Type == MT_Inline) { | |
1881 | IsAtEnd = true; | |
1a4d82fc | 1882 | CurrentEntry = nullptr; |
223e47cc LB |
1883 | return; |
1884 | } | |
1885 | } | |
1886 | Token T = peekNext(); | |
1887 | if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { | |
1888 | // KeyValueNode eats the TK_Key. That way it can detect null keys. | |
1889 | CurrentEntry = new (getAllocator()) KeyValueNode(Doc); | |
1890 | } else if (Type == MT_Block) { | |
1891 | switch (T.Kind) { | |
1892 | case Token::TK_BlockEnd: | |
1893 | getNext(); | |
1894 | IsAtEnd = true; | |
1a4d82fc | 1895 | CurrentEntry = nullptr; |
223e47cc LB |
1896 | break; |
1897 | default: | |
1898 | setError("Unexpected token. Expected Key or Block End", T); | |
1899 | case Token::TK_Error: | |
1900 | IsAtEnd = true; | |
1a4d82fc | 1901 | CurrentEntry = nullptr; |
223e47cc LB |
1902 | } |
1903 | } else { | |
1904 | switch (T.Kind) { | |
1905 | case Token::TK_FlowEntry: | |
1906 | // Eat the flow entry and recurse. | |
1907 | getNext(); | |
1908 | return increment(); | |
1909 | case Token::TK_FlowMappingEnd: | |
1910 | getNext(); | |
1911 | case Token::TK_Error: | |
1912 | // Set this to end iterator. | |
1913 | IsAtEnd = true; | |
1a4d82fc | 1914 | CurrentEntry = nullptr; |
223e47cc LB |
1915 | break; |
1916 | default: | |
1917 | setError( "Unexpected token. Expected Key, Flow Entry, or Flow " | |
1918 | "Mapping End." | |
1919 | , T); | |
1920 | IsAtEnd = true; | |
1a4d82fc | 1921 | CurrentEntry = nullptr; |
223e47cc LB |
1922 | } |
1923 | } | |
1924 | } | |
1925 | ||
1926 | void SequenceNode::increment() { | |
1927 | if (failed()) { | |
1928 | IsAtEnd = true; | |
1a4d82fc | 1929 | CurrentEntry = nullptr; |
223e47cc LB |
1930 | return; |
1931 | } | |
1932 | if (CurrentEntry) | |
1933 | CurrentEntry->skip(); | |
1934 | Token T = peekNext(); | |
1935 | if (SeqType == ST_Block) { | |
1936 | switch (T.Kind) { | |
1937 | case Token::TK_BlockEntry: | |
1938 | getNext(); | |
1939 | CurrentEntry = parseBlockNode(); | |
1a4d82fc | 1940 | if (!CurrentEntry) { // An error occurred. |
223e47cc | 1941 | IsAtEnd = true; |
1a4d82fc | 1942 | CurrentEntry = nullptr; |
223e47cc LB |
1943 | } |
1944 | break; | |
1945 | case Token::TK_BlockEnd: | |
1946 | getNext(); | |
1947 | IsAtEnd = true; | |
1a4d82fc | 1948 | CurrentEntry = nullptr; |
223e47cc LB |
1949 | break; |
1950 | default: | |
1951 | setError( "Unexpected token. Expected Block Entry or Block End." | |
1952 | , T); | |
1953 | case Token::TK_Error: | |
1954 | IsAtEnd = true; | |
1a4d82fc | 1955 | CurrentEntry = nullptr; |
223e47cc LB |
1956 | } |
1957 | } else if (SeqType == ST_Indentless) { | |
1958 | switch (T.Kind) { | |
1959 | case Token::TK_BlockEntry: | |
1960 | getNext(); | |
1961 | CurrentEntry = parseBlockNode(); | |
1a4d82fc | 1962 | if (!CurrentEntry) { // An error occurred. |
223e47cc | 1963 | IsAtEnd = true; |
1a4d82fc | 1964 | CurrentEntry = nullptr; |
223e47cc LB |
1965 | } |
1966 | break; | |
1967 | default: | |
1968 | case Token::TK_Error: | |
1969 | IsAtEnd = true; | |
1a4d82fc | 1970 | CurrentEntry = nullptr; |
223e47cc LB |
1971 | } |
1972 | } else if (SeqType == ST_Flow) { | |
1973 | switch (T.Kind) { | |
1974 | case Token::TK_FlowEntry: | |
1975 | // Eat the flow entry and recurse. | |
1976 | getNext(); | |
1977 | WasPreviousTokenFlowEntry = true; | |
1978 | return increment(); | |
1979 | case Token::TK_FlowSequenceEnd: | |
1980 | getNext(); | |
1981 | case Token::TK_Error: | |
1982 | // Set this to end iterator. | |
1983 | IsAtEnd = true; | |
1a4d82fc | 1984 | CurrentEntry = nullptr; |
223e47cc LB |
1985 | break; |
1986 | case Token::TK_StreamEnd: | |
1987 | case Token::TK_DocumentEnd: | |
1988 | case Token::TK_DocumentStart: | |
1989 | setError("Could not find closing ]!", T); | |
1990 | // Set this to end iterator. | |
1991 | IsAtEnd = true; | |
1a4d82fc | 1992 | CurrentEntry = nullptr; |
223e47cc LB |
1993 | break; |
1994 | default: | |
1995 | if (!WasPreviousTokenFlowEntry) { | |
1996 | setError("Expected , between entries!", T); | |
1997 | IsAtEnd = true; | |
1a4d82fc | 1998 | CurrentEntry = nullptr; |
223e47cc LB |
1999 | break; |
2000 | } | |
2001 | // Otherwise it must be a flow entry. | |
2002 | CurrentEntry = parseBlockNode(); | |
2003 | if (!CurrentEntry) { | |
2004 | IsAtEnd = true; | |
2005 | } | |
2006 | WasPreviousTokenFlowEntry = false; | |
2007 | break; | |
2008 | } | |
2009 | } | |
2010 | } | |
2011 | ||
1a4d82fc JJ |
2012 | Document::Document(Stream &S) : stream(S), Root(nullptr) { |
2013 | // Tag maps starts with two default mappings. | |
2014 | TagMap["!"] = "!"; | |
2015 | TagMap["!!"] = "tag:yaml.org,2002:"; | |
2016 | ||
223e47cc LB |
2017 | if (parseDirectives()) |
2018 | expectToken(Token::TK_DocumentStart); | |
2019 | Token &T = peekNext(); | |
2020 | if (T.Kind == Token::TK_DocumentStart) | |
2021 | getNext(); | |
2022 | } | |
2023 | ||
2024 | bool Document::skip() { | |
2025 | if (stream.scanner->failed()) | |
2026 | return false; | |
2027 | if (!Root) | |
2028 | getRoot(); | |
2029 | Root->skip(); | |
2030 | Token &T = peekNext(); | |
2031 | if (T.Kind == Token::TK_StreamEnd) | |
2032 | return false; | |
2033 | if (T.Kind == Token::TK_DocumentEnd) { | |
2034 | getNext(); | |
2035 | return skip(); | |
2036 | } | |
2037 | return true; | |
2038 | } | |
2039 | ||
2040 | Token &Document::peekNext() { | |
2041 | return stream.scanner->peekNext(); | |
2042 | } | |
2043 | ||
2044 | Token Document::getNext() { | |
2045 | return stream.scanner->getNext(); | |
2046 | } | |
2047 | ||
2048 | void Document::setError(const Twine &Message, Token &Location) const { | |
2049 | stream.scanner->setError(Message, Location.Range.begin()); | |
2050 | } | |
2051 | ||
2052 | bool Document::failed() const { | |
2053 | return stream.scanner->failed(); | |
2054 | } | |
2055 | ||
2056 | Node *Document::parseBlockNode() { | |
2057 | Token T = peekNext(); | |
2058 | // Handle properties. | |
2059 | Token AnchorInfo; | |
1a4d82fc | 2060 | Token TagInfo; |
223e47cc LB |
2061 | parse_property: |
2062 | switch (T.Kind) { | |
2063 | case Token::TK_Alias: | |
2064 | getNext(); | |
2065 | return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); | |
2066 | case Token::TK_Anchor: | |
2067 | if (AnchorInfo.Kind == Token::TK_Anchor) { | |
2068 | setError("Already encountered an anchor for this node!", T); | |
1a4d82fc | 2069 | return nullptr; |
223e47cc LB |
2070 | } |
2071 | AnchorInfo = getNext(); // Consume TK_Anchor. | |
2072 | T = peekNext(); | |
2073 | goto parse_property; | |
2074 | case Token::TK_Tag: | |
1a4d82fc JJ |
2075 | if (TagInfo.Kind == Token::TK_Tag) { |
2076 | setError("Already encountered a tag for this node!", T); | |
2077 | return nullptr; | |
2078 | } | |
2079 | TagInfo = getNext(); // Consume TK_Tag. | |
223e47cc LB |
2080 | T = peekNext(); |
2081 | goto parse_property; | |
2082 | default: | |
2083 | break; | |
2084 | } | |
2085 | ||
2086 | switch (T.Kind) { | |
2087 | case Token::TK_BlockEntry: | |
2088 | // We got an unindented BlockEntry sequence. This is not terminated with | |
2089 | // a BlockEnd. | |
2090 | // Don't eat the TK_BlockEntry, SequenceNode needs it. | |
2091 | return new (NodeAllocator) SequenceNode( stream.CurrentDoc | |
2092 | , AnchorInfo.Range.substr(1) | |
1a4d82fc | 2093 | , TagInfo.Range |
223e47cc LB |
2094 | , SequenceNode::ST_Indentless); |
2095 | case Token::TK_BlockSequenceStart: | |
2096 | getNext(); | |
2097 | return new (NodeAllocator) | |
2098 | SequenceNode( stream.CurrentDoc | |
2099 | , AnchorInfo.Range.substr(1) | |
1a4d82fc | 2100 | , TagInfo.Range |
223e47cc LB |
2101 | , SequenceNode::ST_Block); |
2102 | case Token::TK_BlockMappingStart: | |
2103 | getNext(); | |
2104 | return new (NodeAllocator) | |
2105 | MappingNode( stream.CurrentDoc | |
2106 | , AnchorInfo.Range.substr(1) | |
1a4d82fc | 2107 | , TagInfo.Range |
223e47cc LB |
2108 | , MappingNode::MT_Block); |
2109 | case Token::TK_FlowSequenceStart: | |
2110 | getNext(); | |
2111 | return new (NodeAllocator) | |
2112 | SequenceNode( stream.CurrentDoc | |
2113 | , AnchorInfo.Range.substr(1) | |
1a4d82fc | 2114 | , TagInfo.Range |
223e47cc LB |
2115 | , SequenceNode::ST_Flow); |
2116 | case Token::TK_FlowMappingStart: | |
2117 | getNext(); | |
2118 | return new (NodeAllocator) | |
2119 | MappingNode( stream.CurrentDoc | |
2120 | , AnchorInfo.Range.substr(1) | |
1a4d82fc | 2121 | , TagInfo.Range |
223e47cc LB |
2122 | , MappingNode::MT_Flow); |
2123 | case Token::TK_Scalar: | |
2124 | getNext(); | |
2125 | return new (NodeAllocator) | |
2126 | ScalarNode( stream.CurrentDoc | |
2127 | , AnchorInfo.Range.substr(1) | |
1a4d82fc | 2128 | , TagInfo.Range |
223e47cc LB |
2129 | , T.Range); |
2130 | case Token::TK_Key: | |
2131 | // Don't eat the TK_Key, KeyValueNode expects it. | |
2132 | return new (NodeAllocator) | |
2133 | MappingNode( stream.CurrentDoc | |
2134 | , AnchorInfo.Range.substr(1) | |
1a4d82fc | 2135 | , TagInfo.Range |
223e47cc LB |
2136 | , MappingNode::MT_Inline); |
2137 | case Token::TK_DocumentStart: | |
2138 | case Token::TK_DocumentEnd: | |
2139 | case Token::TK_StreamEnd: | |
2140 | default: | |
2141 | // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not | |
2142 | // !!null null. | |
2143 | return new (NodeAllocator) NullNode(stream.CurrentDoc); | |
2144 | case Token::TK_Error: | |
1a4d82fc | 2145 | return nullptr; |
223e47cc LB |
2146 | } |
2147 | llvm_unreachable("Control flow shouldn't reach here."); | |
1a4d82fc | 2148 | return nullptr; |
223e47cc LB |
2149 | } |
2150 | ||
2151 | bool Document::parseDirectives() { | |
2152 | bool isDirective = false; | |
2153 | while (true) { | |
2154 | Token T = peekNext(); | |
2155 | if (T.Kind == Token::TK_TagDirective) { | |
1a4d82fc | 2156 | parseTAGDirective(); |
223e47cc LB |
2157 | isDirective = true; |
2158 | } else if (T.Kind == Token::TK_VersionDirective) { | |
1a4d82fc | 2159 | parseYAMLDirective(); |
223e47cc LB |
2160 | isDirective = true; |
2161 | } else | |
2162 | break; | |
2163 | } | |
2164 | return isDirective; | |
2165 | } | |
2166 | ||
1a4d82fc JJ |
2167 | void Document::parseYAMLDirective() { |
2168 | getNext(); // Eat %YAML <version> | |
2169 | } | |
2170 | ||
2171 | void Document::parseTAGDirective() { | |
2172 | Token Tag = getNext(); // %TAG <handle> <prefix> | |
2173 | StringRef T = Tag.Range; | |
2174 | // Strip %TAG | |
2175 | T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); | |
2176 | std::size_t HandleEnd = T.find_first_of(" \t"); | |
2177 | StringRef TagHandle = T.substr(0, HandleEnd); | |
2178 | StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); | |
2179 | TagMap[TagHandle] = TagPrefix; | |
2180 | } | |
2181 | ||
223e47cc LB |
2182 | bool Document::expectToken(int TK) { |
2183 | Token T = getNext(); | |
2184 | if (T.Kind != TK) { | |
2185 | setError("Unexpected token", T); | |
2186 | return false; | |
2187 | } | |
2188 | return true; | |
2189 | } |