]> git.proxmox.com Git - rustc.git/blob - src/llvm/unittests/Support/ConvertUTFTest.cpp
Imported Upstream version 1.0.0+dfsg1
[rustc.git] / src / llvm / unittests / Support / ConvertUTFTest.cpp
1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9
10 #include "llvm/Support/ConvertUTF.h"
11 #include "gtest/gtest.h"
12 #include <string>
13 #include <utility>
14 #include <vector>
15
16 using namespace llvm;
17
18 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
19 // Src is the look of disapproval.
20 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
21 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
22 std::string Result;
23 bool Success = convertUTF16ToUTF8String(Ref, Result);
24 EXPECT_TRUE(Success);
25 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
26 EXPECT_EQ(Expected, Result);
27 }
28
29 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
30 // Src is the look of disapproval.
31 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
32 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
33 std::string Result;
34 bool Success = convertUTF16ToUTF8String(Ref, Result);
35 EXPECT_TRUE(Success);
36 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
37 EXPECT_EQ(Expected, Result);
38 }
39
40 TEST(ConvertUTFTest, OddLengthInput) {
41 std::string Result;
42 bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
43 EXPECT_FALSE(Success);
44 }
45
46 TEST(ConvertUTFTest, Empty) {
47 std::string Result;
48 bool Success = convertUTF16ToUTF8String(None, Result);
49 EXPECT_TRUE(Success);
50 EXPECT_TRUE(Result.empty());
51 }
52
53 TEST(ConvertUTFTest, HasUTF16BOM) {
54 bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
55 EXPECT_TRUE(HasBOM);
56 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
57 EXPECT_TRUE(HasBOM);
58 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
59 EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
60 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
61 EXPECT_TRUE(HasBOM);
62
63 HasBOM = hasUTF16ByteOrderMark(None);
64 EXPECT_FALSE(HasBOM);
65 HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
66 EXPECT_FALSE(HasBOM);
67 }
68
69 struct ConvertUTFResultContainer {
70 ConversionResult ErrorCode;
71 std::vector<unsigned> UnicodeScalars;
72
73 ConvertUTFResultContainer(ConversionResult ErrorCode)
74 : ErrorCode(ErrorCode) {}
75
76 ConvertUTFResultContainer
77 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
78 unsigned US2 = 0x110000, unsigned US3 = 0x110000,
79 unsigned US4 = 0x110000, unsigned US5 = 0x110000,
80 unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
81 ConvertUTFResultContainer Result(*this);
82 if (US0 != 0x110000)
83 Result.UnicodeScalars.push_back(US0);
84 if (US1 != 0x110000)
85 Result.UnicodeScalars.push_back(US1);
86 if (US2 != 0x110000)
87 Result.UnicodeScalars.push_back(US2);
88 if (US3 != 0x110000)
89 Result.UnicodeScalars.push_back(US3);
90 if (US4 != 0x110000)
91 Result.UnicodeScalars.push_back(US4);
92 if (US5 != 0x110000)
93 Result.UnicodeScalars.push_back(US5);
94 if (US6 != 0x110000)
95 Result.UnicodeScalars.push_back(US6);
96 if (US7 != 0x110000)
97 Result.UnicodeScalars.push_back(US7);
98 return Result;
99 }
100 };
101
102 std::pair<ConversionResult, std::vector<unsigned>>
103 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
104 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
105
106 const UTF8 *SourceNext = SourceStart;
107 std::vector<UTF32> Decoded(S.size(), 0);
108 UTF32 *TargetStart = Decoded.data();
109
110 auto ErrorCode =
111 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
112 Decoded.data() + Decoded.size(), lenientConversion);
113
114 Decoded.resize(TargetStart - Decoded.data());
115
116 return std::make_pair(ErrorCode, Decoded);
117 }
118
119 std::pair<ConversionResult, std::vector<unsigned>>
120 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
121 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
122
123 const UTF8 *SourceNext = SourceStart;
124 std::vector<UTF32> Decoded(S.size(), 0);
125 UTF32 *TargetStart = Decoded.data();
126
127 auto ErrorCode = ConvertUTF8toUTF32Partial(
128 &SourceNext, SourceStart + S.size(), &TargetStart,
129 Decoded.data() + Decoded.size(), lenientConversion);
130
131 Decoded.resize(TargetStart - Decoded.data());
132
133 return std::make_pair(ErrorCode, Decoded);
134 }
135
136 ::testing::AssertionResult
137 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
138 StringRef S, bool Partial = false) {
139 ConversionResult ErrorCode;
140 std::vector<unsigned> Decoded;
141 if (!Partial)
142 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
143 else
144 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
145
146 if (Expected.ErrorCode != ErrorCode)
147 return ::testing::AssertionFailure() << "Expected error code "
148 << Expected.ErrorCode << ", actual "
149 << ErrorCode;
150
151 if (Expected.UnicodeScalars != Decoded)
152 return ::testing::AssertionFailure()
153 << "Expected lenient decoded result:\n"
154 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
155 << "Actual result:\n" << ::testing::PrintToString(Decoded);
156
157 return ::testing::AssertionSuccess();
158 }
159
160 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
161
162 //
163 // 1-byte sequences
164 //
165
166 // U+0041 LATIN CAPITAL LETTER A
167 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
168 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
169
170 //
171 // 2-byte sequences
172 //
173
174 // U+0283 LATIN SMALL LETTER ESH
175 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
176 ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
177 "\xca\x83"));
178
179 // U+03BA GREEK SMALL LETTER KAPPA
180 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
181 // U+03C3 GREEK SMALL LETTER SIGMA
182 // U+03BC GREEK SMALL LETTER MU
183 // U+03B5 GREEK SMALL LETTER EPSILON
184 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
185 ConvertUTFResultContainer(conversionOK)
186 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
187 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
188
189 //
190 // 3-byte sequences
191 //
192
193 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
194 // U+6587 CJK UNIFIED IDEOGRAPH-6587
195 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
196 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
197 "\xe4\xbe\x8b\xe6\x96\x87"));
198
199 // U+D55C HANGUL SYLLABLE HAN
200 // U+AE00 HANGUL SYLLABLE GEUL
201 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
202 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
203 "\xed\x95\x9c\xea\xb8\x80"));
204
205 // U+1112 HANGUL CHOSEONG HIEUH
206 // U+1161 HANGUL JUNGSEONG A
207 // U+11AB HANGUL JONGSEONG NIEUN
208 // U+1100 HANGUL CHOSEONG KIYEOK
209 // U+1173 HANGUL JUNGSEONG EU
210 // U+11AF HANGUL JONGSEONG RIEUL
211 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
212 ConvertUTFResultContainer(conversionOK)
213 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
214 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
215 "\xe1\x86\xaf"));
216
217 //
218 // 4-byte sequences
219 //
220
221 // U+E0100 VARIATION SELECTOR-17
222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
223 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
224 "\xf3\xa0\x84\x80"));
225
226 //
227 // First possible sequence of a certain length
228 //
229
230 // U+0000 NULL
231 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
232 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
233 StringRef("\x00", 1)));
234
235 // U+0080 PADDING CHARACTER
236 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
237 ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
238 "\xc2\x80"));
239
240 // U+0800 SAMARITAN LETTER ALAF
241 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
242 ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
243 "\xe0\xa0\x80"));
244
245 // U+10000 LINEAR B SYLLABLE B008 A
246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
247 ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
248 "\xf0\x90\x80\x80"));
249
250 // U+200000 (invalid)
251 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
252 ConvertUTFResultContainer(sourceIllegal)
253 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
254 "\xf8\x88\x80\x80\x80"));
255
256 // U+4000000 (invalid)
257 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
258 ConvertUTFResultContainer(sourceIllegal)
259 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
260 "\xfc\x84\x80\x80\x80\x80"));
261
262 //
263 // Last possible sequence of a certain length
264 //
265
266 // U+007F DELETE
267 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
268 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
269
270 // U+07FF (unassigned)
271 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
272 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
273 "\xdf\xbf"));
274
275 // U+FFFF (noncharacter)
276 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
277 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
278 "\xef\xbf\xbf"));
279
280 // U+1FFFFF (invalid)
281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
282 ConvertUTFResultContainer(sourceIllegal)
283 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
284 "\xf7\xbf\xbf\xbf"));
285
286 // U+3FFFFFF (invalid)
287 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
288 ConvertUTFResultContainer(sourceIllegal)
289 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
290 "\xfb\xbf\xbf\xbf\xbf"));
291
292 // U+7FFFFFFF (invalid)
293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
294 ConvertUTFResultContainer(sourceIllegal)
295 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
296 "\xfd\xbf\xbf\xbf\xbf\xbf"));
297
298 //
299 // Other boundary conditions
300 //
301
302 // U+D7FF (unassigned)
303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
304 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
305 "\xed\x9f\xbf"));
306
307 // U+E000 (private use)
308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
309 ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
310 "\xee\x80\x80"));
311
312 // U+FFFD REPLACEMENT CHARACTER
313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
314 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
315 "\xef\xbf\xbd"));
316
317 // U+10FFFF (noncharacter)
318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
320 "\xf4\x8f\xbf\xbf"));
321
322 // U+110000 (invalid)
323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324 ConvertUTFResultContainer(sourceIllegal)
325 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
326 "\xf4\x90\x80\x80"));
327
328 //
329 // Unexpected continuation bytes
330 //
331
332 // A sequence of unexpected continuation bytes that don't follow a first
333 // byte, every byte is a maximal subpart.
334
335 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
336 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
337 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
338 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
339 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
340 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
341 "\x80\x80"));
342 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
343 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
344 "\x80\xbf"));
345 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
346 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
347 "\xbf\x80"));
348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
349 ConvertUTFResultContainer(sourceIllegal)
350 .withScalars(0xfffd, 0xfffd, 0xfffd),
351 "\x80\xbf\x80"));
352 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
353 ConvertUTFResultContainer(sourceIllegal)
354 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
355 "\x80\xbf\x80\xbf"));
356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357 ConvertUTFResultContainer(sourceIllegal)
358 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
359 "\x80\xbf\x82\xbf\xaa"));
360 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
361 ConvertUTFResultContainer(sourceIllegal)
362 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
363 "\xaa\xb0\xbb\xbf\xaa\xa0"));
364 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
365 ConvertUTFResultContainer(sourceIllegal)
366 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
367 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
368
369 // All continuation bytes (0x80--0xbf).
370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371 ConvertUTFResultContainer(sourceIllegal)
372 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
373 0xfffd, 0xfffd, 0xfffd, 0xfffd)
374 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
375 0xfffd, 0xfffd, 0xfffd, 0xfffd)
376 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
377 0xfffd, 0xfffd, 0xfffd, 0xfffd)
378 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
379 0xfffd, 0xfffd, 0xfffd, 0xfffd)
380 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
381 0xfffd, 0xfffd, 0xfffd, 0xfffd)
382 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
383 0xfffd, 0xfffd, 0xfffd, 0xfffd)
384 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
385 0xfffd, 0xfffd, 0xfffd, 0xfffd)
386 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
387 0xfffd, 0xfffd, 0xfffd, 0xfffd),
388 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
389 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
390 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
391 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
392
393 //
394 // Lonely start bytes
395 //
396
397 // Start bytes of 2-byte sequences (0xc0--0xdf).
398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
399 ConvertUTFResultContainer(sourceIllegal)
400 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
401 0xfffd, 0xfffd, 0xfffd, 0xfffd)
402 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
403 0xfffd, 0xfffd, 0xfffd, 0xfffd)
404 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
405 0xfffd, 0xfffd, 0xfffd, 0xfffd)
406 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
407 0xfffd, 0xfffd, 0xfffd, 0xfffd),
408 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
409 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
410
411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412 ConvertUTFResultContainer(sourceIllegal)
413 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
414 0xfffd, 0x0020, 0xfffd, 0x0020)
415 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
416 0xfffd, 0x0020, 0xfffd, 0x0020)
417 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
418 0xfffd, 0x0020, 0xfffd, 0x0020)
419 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
420 0xfffd, 0x0020, 0xfffd, 0x0020)
421 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
422 0xfffd, 0x0020, 0xfffd, 0x0020)
423 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
424 0xfffd, 0x0020, 0xfffd, 0x0020)
425 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
426 0xfffd, 0x0020, 0xfffd, 0x0020)
427 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
428 0xfffd, 0x0020, 0xfffd, 0x0020),
429 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
430 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
431 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
432 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
433
434 // Start bytes of 3-byte sequences (0xe0--0xef).
435 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
436 ConvertUTFResultContainer(sourceIllegal)
437 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
438 0xfffd, 0xfffd, 0xfffd, 0xfffd)
439 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
440 0xfffd, 0xfffd, 0xfffd, 0xfffd),
441 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
442
443 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
444 ConvertUTFResultContainer(sourceIllegal)
445 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
446 0xfffd, 0x0020, 0xfffd, 0x0020)
447 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
448 0xfffd, 0x0020, 0xfffd, 0x0020)
449 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
450 0xfffd, 0x0020, 0xfffd, 0x0020)
451 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
452 0xfffd, 0x0020, 0xfffd, 0x0020),
453 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
454 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
455
456 // Start bytes of 4-byte sequences (0xf0--0xf7).
457 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
458 ConvertUTFResultContainer(sourceIllegal)
459 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
460 0xfffd, 0xfffd, 0xfffd, 0xfffd),
461 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
462
463 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
464 ConvertUTFResultContainer(sourceIllegal)
465 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466 0xfffd, 0x0020, 0xfffd, 0x0020)
467 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
468 0xfffd, 0x0020, 0xfffd, 0x0020),
469 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
470
471 // Start bytes of 5-byte sequences (0xf8--0xfb).
472 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
473 ConvertUTFResultContainer(sourceIllegal)
474 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
475 "\xf8\xf9\xfa\xfb"));
476
477 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
478 ConvertUTFResultContainer(sourceIllegal)
479 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
480 0xfffd, 0x0020, 0xfffd, 0x0020),
481 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
482
483 // Start bytes of 6-byte sequences (0xfc--0xfd).
484 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
485 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
486 "\xfc\xfd"));
487
488 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
489 ConvertUTFResultContainer(sourceIllegal)
490 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
491 "\xfc\x20\xfd\x20"));
492
493 //
494 // Other bytes (0xc0--0xc1, 0xfe--0xff).
495 //
496
497 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
498 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
499 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
500 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
501 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
502 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
503 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
504 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
505
506 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
507 ConvertUTFResultContainer(sourceIllegal)
508 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
509 "\xc0\xc1\xfe\xff"));
510
511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512 ConvertUTFResultContainer(sourceIllegal)
513 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
514 "\xfe\xfe\xff\xff"));
515
516 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
517 ConvertUTFResultContainer(sourceIllegal)
518 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
519 "\xfe\x80\x80\x80\x80\x80"));
520
521 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
522 ConvertUTFResultContainer(sourceIllegal)
523 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
524 "\xff\x80\x80\x80\x80\x80"));
525
526 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
527 ConvertUTFResultContainer(sourceIllegal)
528 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
529 0xfffd, 0x0020, 0xfffd, 0x0020),
530 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
531
532 //
533 // Sequences with one continuation byte missing
534 //
535
536 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
537 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
539 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
540 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
541 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
542 "\xe0\xa0"));
543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
544 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
545 "\xe0\xbf"));
546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
547 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
548 "\xe1\x80"));
549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
551 "\xec\xbf"));
552 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
553 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
554 "\xed\x80"));
555 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
556 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
557 "\xed\x9f"));
558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
559 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
560 "\xee\x80"));
561 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
562 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
563 "\xef\xbf"));
564 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
565 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
566 "\xf0\x90\x80"));
567 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
568 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
569 "\xf0\xbf\xbf"));
570 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
571 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
572 "\xf1\x80\x80"));
573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
575 "\xf3\xbf\xbf"));
576 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
577 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
578 "\xf4\x80\x80"));
579 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
580 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
581 "\xf4\x8f\xbf"));
582
583 // Overlong sequences with one trailing byte missing.
584 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
586 "\xc0"));
587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
589 "\xc1"));
590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
592 "\xe0\x80"));
593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
595 "\xe0\x9f"));
596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
597 ConvertUTFResultContainer(sourceIllegal)
598 .withScalars(0xfffd, 0xfffd, 0xfffd),
599 "\xf0\x80\x80"));
600 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
601 ConvertUTFResultContainer(sourceIllegal)
602 .withScalars(0xfffd, 0xfffd, 0xfffd),
603 "\xf0\x8f\x80"));
604 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
605 ConvertUTFResultContainer(sourceIllegal)
606 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
607 "\xf8\x80\x80\x80"));
608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609 ConvertUTFResultContainer(sourceIllegal)
610 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
611 "\xfc\x80\x80\x80\x80"));
612
613 // Sequences that represent surrogates with one trailing byte missing.
614 // High surrogates
615 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
617 "\xed\xa0"));
618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
620 "\xed\xac"));
621 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
623 "\xed\xaf"));
624 // Low surrogates
625 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
626 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
627 "\xed\xb0"));
628 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
629 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
630 "\xed\xb4"));
631 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
632 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
633 "\xed\xbf"));
634
635 // Ill-formed 4-byte sequences.
636 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
637 // U+1100xx (invalid)
638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
639 ConvertUTFResultContainer(sourceIllegal)
640 .withScalars(0xfffd, 0xfffd, 0xfffd),
641 "\xf4\x90\x80"));
642 // U+13FBxx (invalid)
643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
644 ConvertUTFResultContainer(sourceIllegal)
645 .withScalars(0xfffd, 0xfffd, 0xfffd),
646 "\xf4\xbf\xbf"));
647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
648 ConvertUTFResultContainer(sourceIllegal)
649 .withScalars(0xfffd, 0xfffd, 0xfffd),
650 "\xf5\x80\x80"));
651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652 ConvertUTFResultContainer(sourceIllegal)
653 .withScalars(0xfffd, 0xfffd, 0xfffd),
654 "\xf6\x80\x80"));
655 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
656 ConvertUTFResultContainer(sourceIllegal)
657 .withScalars(0xfffd, 0xfffd, 0xfffd),
658 "\xf7\x80\x80"));
659 // U+1FFBxx (invalid)
660 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
661 ConvertUTFResultContainer(sourceIllegal)
662 .withScalars(0xfffd, 0xfffd, 0xfffd),
663 "\xf7\xbf\xbf"));
664
665 // Ill-formed 5-byte sequences.
666 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
667 // U+2000xx (invalid)
668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669 ConvertUTFResultContainer(sourceIllegal)
670 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
671 "\xf8\x88\x80\x80"));
672 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673 ConvertUTFResultContainer(sourceIllegal)
674 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
675 "\xf8\xbf\xbf\xbf"));
676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677 ConvertUTFResultContainer(sourceIllegal)
678 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
679 "\xf9\x80\x80\x80"));
680 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
681 ConvertUTFResultContainer(sourceIllegal)
682 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
683 "\xfa\x80\x80\x80"));
684 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
685 ConvertUTFResultContainer(sourceIllegal)
686 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
687 "\xfb\x80\x80\x80"));
688 // U+3FFFFxx (invalid)
689 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
690 ConvertUTFResultContainer(sourceIllegal)
691 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
692 "\xfb\xbf\xbf\xbf"));
693
694 // Ill-formed 6-byte sequences.
695 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
696 // U+40000xx (invalid)
697 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
698 ConvertUTFResultContainer(sourceIllegal)
699 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
700 "\xfc\x84\x80\x80\x80"));
701 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
702 ConvertUTFResultContainer(sourceIllegal)
703 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
704 "\xfc\xbf\xbf\xbf\xbf"));
705 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
706 ConvertUTFResultContainer(sourceIllegal)
707 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
708 "\xfd\x80\x80\x80\x80"));
709 // U+7FFFFFxx (invalid)
710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
711 ConvertUTFResultContainer(sourceIllegal)
712 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
713 "\xfd\xbf\xbf\xbf\xbf"));
714
715 //
716 // Sequences with two continuation bytes missing
717 //
718
719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
721 "\xf0\x90"));
722 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
723 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
724 "\xf0\xbf"));
725 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
726 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
727 "\xf1\x80"));
728 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
729 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
730 "\xf3\xbf"));
731 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
732 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
733 "\xf4\x80"));
734 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
735 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
736 "\xf4\x8f"));
737
738 // Overlong sequences with two trailing bytes missing.
739 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
740 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
741 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
742 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
743 "\xf0\x80"));
744 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
746 "\xf0\x8f"));
747 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
748 ConvertUTFResultContainer(sourceIllegal)
749 .withScalars(0xfffd, 0xfffd, 0xfffd),
750 "\xf8\x80\x80"));
751 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
752 ConvertUTFResultContainer(sourceIllegal)
753 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
754 "\xfc\x80\x80\x80"));
755
756 // Sequences that represent surrogates with two trailing bytes missing.
757 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
758 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
759
760 // Ill-formed 4-byte sequences.
761 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
762 // U+110yxx (invalid)
763 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
764 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
765 "\xf4\x90"));
766 // U+13Fyxx (invalid)
767 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
768 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
769 "\xf4\xbf"));
770 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
771 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
772 "\xf5\x80"));
773 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
775 "\xf6\x80"));
776 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
777 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
778 "\xf7\x80"));
779 // U+1FFyxx (invalid)
780 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
781 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
782 "\xf7\xbf"));
783
784 // Ill-formed 5-byte sequences.
785 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
786 // U+200yxx (invalid)
787 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
789 "\xf8\x88\x80"));
790 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
791 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
792 "\xf8\xbf\xbf"));
793 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
794 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
795 "\xf9\x80\x80"));
796 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
797 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
798 "\xfa\x80\x80"));
799 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
800 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
801 "\xfb\x80\x80"));
802 // U+3FFFyxx (invalid)
803 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
804 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
805 "\xfb\xbf\xbf"));
806
807 // Ill-formed 6-byte sequences.
808 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
809 // U+4000yxx (invalid)
810 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
812 "\xfc\x84\x80\x80"));
813 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
815 "\xfc\xbf\xbf\xbf"));
816 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
817 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
818 "\xfd\x80\x80\x80"));
819 // U+7FFFFyxx (invalid)
820 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
821 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
822 "\xfd\xbf\xbf\xbf"));
823
824 //
825 // Sequences with three continuation bytes missing
826 //
827
828 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
829 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
830 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
831 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
832 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
833 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
834 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
836 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
837 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
838
839 // Broken overlong sequences.
840 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
841 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
842 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
844 "\xf8\x80"));
845 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
846 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
847 "\xfc\x80\x80"));
848
849 // Ill-formed 4-byte sequences.
850 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
851 // U+14yyxx (invalid)
852 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
853 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
854 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
855 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
856 // U+1Cyyxx (invalid)
857 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
858 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
859
860 // Ill-formed 5-byte sequences.
861 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
862 // U+20yyxx (invalid)
863 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
864 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
865 "\xf8\x88"));
866 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
867 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
868 "\xf8\xbf"));
869 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
870 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
871 "\xf9\x80"));
872 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
873 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
874 "\xfa\x80"));
875 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
877 "\xfb\x80"));
878 // U+3FCyyxx (invalid)
879 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
880 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
881 "\xfb\xbf"));
882
883 // Ill-formed 6-byte sequences.
884 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
885 // U+400yyxx (invalid)
886 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
887 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
888 "\xfc\x84\x80"));
889 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
891 "\xfc\xbf\xbf"));
892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
894 "\xfd\x80\x80"));
895 // U+7FFCyyxx (invalid)
896 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
897 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
898 "\xfd\xbf\xbf"));
899
900 //
901 // Sequences with four continuation bytes missing
902 //
903
904 // Ill-formed 5-byte sequences.
905 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
906 // U+uzyyxx (invalid)
907 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
908 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
909 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
910 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
911 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
912 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
913 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
914 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
915 // U+3zyyxx (invalid)
916 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
917 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
918
919 // Broken overlong sequences.
920 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
921 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
922 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
923 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
924 "\xfc\x80"));
925
926 // Ill-formed 6-byte sequences.
927 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
928 // U+uzzyyxx (invalid)
929 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
930 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
931 "\xfc\x84"));
932 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
933 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
934 "\xfc\xbf"));
935 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
936 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
937 "\xfd\x80"));
938 // U+7Fzzyyxx (invalid)
939 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
940 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
941 "\xfd\xbf"));
942
943 //
944 // Sequences with five continuation bytes missing
945 //
946
947 // Ill-formed 6-byte sequences.
948 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
949 // U+uzzyyxx (invalid)
950 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
951 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
952 // U+uuzzyyxx (invalid)
953 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
954 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
955
956 //
957 // Consecutive sequences with trailing bytes missing
958 //
959
960 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
961 ConvertUTFResultContainer(sourceIllegal)
962 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
963 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
964 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
965 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
966 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
967 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
968 "\xc0" "\xe0\x80" "\xf0\x80\x80"
969 "\xf8\x80\x80\x80"
970 "\xfc\x80\x80\x80\x80"
971 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
972 "\xfb\xbf\xbf\xbf"
973 "\xfd\xbf\xbf\xbf\xbf"));
974
975 //
976 // Overlong UTF-8 sequences
977 //
978
979 // U+002F SOLIDUS
980 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
981 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
982
983 // Overlong sequences of the above.
984 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
985 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
986 "\xc0\xaf"));
987 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
988 ConvertUTFResultContainer(sourceIllegal)
989 .withScalars(0xfffd, 0xfffd, 0xfffd),
990 "\xe0\x80\xaf"));
991 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
992 ConvertUTFResultContainer(sourceIllegal)
993 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
994 "\xf0\x80\x80\xaf"));
995 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
996 ConvertUTFResultContainer(sourceIllegal)
997 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
998 "\xf8\x80\x80\x80\xaf"));
999 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1000 ConvertUTFResultContainer(sourceIllegal)
1001 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1002 "\xfc\x80\x80\x80\x80\xaf"));
1003
1004 // U+0000 NULL
1005 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1006 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1007 StringRef("\x00", 1)));
1008
1009 // Overlong sequences of the above.
1010 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1011 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1012 "\xc0\x80"));
1013 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1014 ConvertUTFResultContainer(sourceIllegal)
1015 .withScalars(0xfffd, 0xfffd, 0xfffd),
1016 "\xe0\x80\x80"));
1017 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1018 ConvertUTFResultContainer(sourceIllegal)
1019 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1020 "\xf0\x80\x80\x80"));
1021 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1022 ConvertUTFResultContainer(sourceIllegal)
1023 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1024 "\xf8\x80\x80\x80\x80"));
1025 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1026 ConvertUTFResultContainer(sourceIllegal)
1027 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1028 "\xfc\x80\x80\x80\x80\x80"));
1029
1030 // Other overlong sequences.
1031 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1033 "\xc0\xbf"));
1034 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1035 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1036 "\xc1\x80"));
1037 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1038 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1039 "\xc1\xbf"));
1040 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1041 ConvertUTFResultContainer(sourceIllegal)
1042 .withScalars(0xfffd, 0xfffd, 0xfffd),
1043 "\xe0\x9f\xbf"));
1044 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1045 ConvertUTFResultContainer(sourceIllegal)
1046 .withScalars(0xfffd, 0xfffd, 0xfffd),
1047 "\xed\xa0\x80"));
1048 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1049 ConvertUTFResultContainer(sourceIllegal)
1050 .withScalars(0xfffd, 0xfffd, 0xfffd),
1051 "\xed\xbf\xbf"));
1052 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053 ConvertUTFResultContainer(sourceIllegal)
1054 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1055 "\xf0\x8f\x80\x80"));
1056 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1057 ConvertUTFResultContainer(sourceIllegal)
1058 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1059 "\xf0\x8f\xbf\xbf"));
1060 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1061 ConvertUTFResultContainer(sourceIllegal)
1062 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1063 "\xf8\x87\xbf\xbf\xbf"));
1064 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1065 ConvertUTFResultContainer(sourceIllegal)
1066 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1067 "\xfc\x83\xbf\xbf\xbf\xbf"));
1068
1069 //
1070 // Isolated surrogates
1071 //
1072
1073 // Unicode 6.3.0:
1074 //
1075 // D71. High-surrogate code point: A Unicode code point in the range
1076 // U+D800 to U+DBFF.
1077 //
1078 // D73. Low-surrogate code point: A Unicode code point in the range
1079 // U+DC00 to U+DFFF.
1080
1081 // Note: U+E0100 is <DB40 DD00> in UTF16.
1082
1083 // High surrogates
1084
1085 // U+D800
1086 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1087 ConvertUTFResultContainer(sourceIllegal)
1088 .withScalars(0xfffd, 0xfffd, 0xfffd),
1089 "\xed\xa0\x80"));
1090
1091 // U+DB40
1092 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1093 ConvertUTFResultContainer(sourceIllegal)
1094 .withScalars(0xfffd, 0xfffd, 0xfffd),
1095 "\xed\xac\xa0"));
1096
1097 // U+DBFF
1098 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1099 ConvertUTFResultContainer(sourceIllegal)
1100 .withScalars(0xfffd, 0xfffd, 0xfffd),
1101 "\xed\xaf\xbf"));
1102
1103 // Low surrogates
1104
1105 // U+DC00
1106 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107 ConvertUTFResultContainer(sourceIllegal)
1108 .withScalars(0xfffd, 0xfffd, 0xfffd),
1109 "\xed\xb0\x80"));
1110
1111 // U+DD00
1112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113 ConvertUTFResultContainer(sourceIllegal)
1114 .withScalars(0xfffd, 0xfffd, 0xfffd),
1115 "\xed\xb4\x80"));
1116
1117 // U+DFFF
1118 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1119 ConvertUTFResultContainer(sourceIllegal)
1120 .withScalars(0xfffd, 0xfffd, 0xfffd),
1121 "\xed\xbf\xbf"));
1122
1123 // Surrogate pairs
1124
1125 // U+D800 U+DC00
1126 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1127 ConvertUTFResultContainer(sourceIllegal)
1128 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1129 "\xed\xa0\x80\xed\xb0\x80"));
1130
1131 // U+D800 U+DD00
1132 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133 ConvertUTFResultContainer(sourceIllegal)
1134 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1135 "\xed\xa0\x80\xed\xb4\x80"));
1136
1137 // U+D800 U+DFFF
1138 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1139 ConvertUTFResultContainer(sourceIllegal)
1140 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1141 "\xed\xa0\x80\xed\xbf\xbf"));
1142
1143 // U+DB40 U+DC00
1144 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1145 ConvertUTFResultContainer(sourceIllegal)
1146 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1147 "\xed\xac\xa0\xed\xb0\x80"));
1148
1149 // U+DB40 U+DD00
1150 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1151 ConvertUTFResultContainer(sourceIllegal)
1152 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1153 "\xed\xac\xa0\xed\xb4\x80"));
1154
1155 // U+DB40 U+DFFF
1156 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1157 ConvertUTFResultContainer(sourceIllegal)
1158 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1159 "\xed\xac\xa0\xed\xbf\xbf"));
1160
1161 // U+DBFF U+DC00
1162 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1163 ConvertUTFResultContainer(sourceIllegal)
1164 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1165 "\xed\xaf\xbf\xed\xb0\x80"));
1166
1167 // U+DBFF U+DD00
1168 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1169 ConvertUTFResultContainer(sourceIllegal)
1170 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1171 "\xed\xaf\xbf\xed\xb4\x80"));
1172
1173 // U+DBFF U+DFFF
1174 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1175 ConvertUTFResultContainer(sourceIllegal)
1176 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1177 "\xed\xaf\xbf\xed\xbf\xbf"));
1178
1179 //
1180 // Noncharacters
1181 //
1182
1183 // Unicode 6.3.0:
1184 //
1185 // D14. Noncharacter: A code point that is permanently reserved for
1186 // internal use and that should never be interchanged. Noncharacters
1187 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 10 hexadecimal)
1188 // and the values U+FDD0..U+FDEF.
1189
1190 // U+FFFE
1191 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1192 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1193 "\xef\xbf\xbe"));
1194
1195 // U+FFFF
1196 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1197 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1198 "\xef\xbf\xbf"));
1199
1200 // U+1FFFE
1201 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1202 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1203 "\xf0\x9f\xbf\xbe"));
1204
1205 // U+1FFFF
1206 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1207 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1208 "\xf0\x9f\xbf\xbf"));
1209
1210 // U+2FFFE
1211 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1212 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1213 "\xf0\xaf\xbf\xbe"));
1214
1215 // U+2FFFF
1216 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1218 "\xf0\xaf\xbf\xbf"));
1219
1220 // U+3FFFE
1221 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1222 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1223 "\xf0\xbf\xbf\xbe"));
1224
1225 // U+3FFFF
1226 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1227 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1228 "\xf0\xbf\xbf\xbf"));
1229
1230 // U+4FFFE
1231 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1232 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1233 "\xf1\x8f\xbf\xbe"));
1234
1235 // U+4FFFF
1236 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1237 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1238 "\xf1\x8f\xbf\xbf"));
1239
1240 // U+5FFFE
1241 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1242 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1243 "\xf1\x9f\xbf\xbe"));
1244
1245 // U+5FFFF
1246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1247 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1248 "\xf1\x9f\xbf\xbf"));
1249
1250 // U+6FFFE
1251 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1252 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1253 "\xf1\xaf\xbf\xbe"));
1254
1255 // U+6FFFF
1256 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1257 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1258 "\xf1\xaf\xbf\xbf"));
1259
1260 // U+7FFFE
1261 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1262 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1263 "\xf1\xbf\xbf\xbe"));
1264
1265 // U+7FFFF
1266 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1267 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1268 "\xf1\xbf\xbf\xbf"));
1269
1270 // U+8FFFE
1271 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1272 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1273 "\xf2\x8f\xbf\xbe"));
1274
1275 // U+8FFFF
1276 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1277 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1278 "\xf2\x8f\xbf\xbf"));
1279
1280 // U+9FFFE
1281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1282 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1283 "\xf2\x9f\xbf\xbe"));
1284
1285 // U+9FFFF
1286 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1287 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1288 "\xf2\x9f\xbf\xbf"));
1289
1290 // U+AFFFE
1291 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1292 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1293 "\xf2\xaf\xbf\xbe"));
1294
1295 // U+AFFFF
1296 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1297 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1298 "\xf2\xaf\xbf\xbf"));
1299
1300 // U+BFFFE
1301 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1302 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1303 "\xf2\xbf\xbf\xbe"));
1304
1305 // U+BFFFF
1306 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1307 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1308 "\xf2\xbf\xbf\xbf"));
1309
1310 // U+CFFFE
1311 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1312 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1313 "\xf3\x8f\xbf\xbe"));
1314
1315 // U+CFFFF
1316 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1317 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1318 "\xf3\x8f\xbf\xbf"));
1319
1320 // U+DFFFE
1321 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1322 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1323 "\xf3\x9f\xbf\xbe"));
1324
1325 // U+DFFFF
1326 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1327 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1328 "\xf3\x9f\xbf\xbf"));
1329
1330 // U+EFFFE
1331 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1332 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1333 "\xf3\xaf\xbf\xbe"));
1334
1335 // U+EFFFF
1336 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1337 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1338 "\xf3\xaf\xbf\xbf"));
1339
1340 // U+FFFFE
1341 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1342 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1343 "\xf3\xbf\xbf\xbe"));
1344
1345 // U+FFFFF
1346 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1347 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1348 "\xf3\xbf\xbf\xbf"));
1349
1350 // U+10FFFE
1351 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1352 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1353 "\xf4\x8f\xbf\xbe"));
1354
1355 // U+10FFFF
1356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1357 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1358 "\xf4\x8f\xbf\xbf"));
1359
1360 // U+FDD0
1361 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1362 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1363 "\xef\xb7\x90"));
1364
1365 // U+FDD1
1366 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1367 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1368 "\xef\xb7\x91"));
1369
1370 // U+FDD2
1371 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1372 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1373 "\xef\xb7\x92"));
1374
1375 // U+FDD3
1376 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1377 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1378 "\xef\xb7\x93"));
1379
1380 // U+FDD4
1381 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1382 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1383 "\xef\xb7\x94"));
1384
1385 // U+FDD5
1386 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1387 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1388 "\xef\xb7\x95"));
1389
1390 // U+FDD6
1391 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1392 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1393 "\xef\xb7\x96"));
1394
1395 // U+FDD7
1396 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1397 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1398 "\xef\xb7\x97"));
1399
1400 // U+FDD8
1401 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1402 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1403 "\xef\xb7\x98"));
1404
1405 // U+FDD9
1406 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1407 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1408 "\xef\xb7\x99"));
1409
1410 // U+FDDA
1411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1412 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1413 "\xef\xb7\x9a"));
1414
1415 // U+FDDB
1416 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1417 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1418 "\xef\xb7\x9b"));
1419
1420 // U+FDDC
1421 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1422 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1423 "\xef\xb7\x9c"));
1424
1425 // U+FDDD
1426 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1427 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1428 "\xef\xb7\x9d"));
1429
1430 // U+FDDE
1431 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1432 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1433 "\xef\xb7\x9e"));
1434
1435 // U+FDDF
1436 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1437 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1438 "\xef\xb7\x9f"));
1439
1440 // U+FDE0
1441 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1442 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1443 "\xef\xb7\xa0"));
1444
1445 // U+FDE1
1446 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1447 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1448 "\xef\xb7\xa1"));
1449
1450 // U+FDE2
1451 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1452 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1453 "\xef\xb7\xa2"));
1454
1455 // U+FDE3
1456 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1457 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1458 "\xef\xb7\xa3"));
1459
1460 // U+FDE4
1461 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1462 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1463 "\xef\xb7\xa4"));
1464
1465 // U+FDE5
1466 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1467 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1468 "\xef\xb7\xa5"));
1469
1470 // U+FDE6
1471 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1472 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1473 "\xef\xb7\xa6"));
1474
1475 // U+FDE7
1476 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1477 ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1478 "\xef\xb7\xa7"));
1479
1480 // U+FDE8
1481 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1482 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1483 "\xef\xb7\xa8"));
1484
1485 // U+FDE9
1486 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1487 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1488 "\xef\xb7\xa9"));
1489
1490 // U+FDEA
1491 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1492 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1493 "\xef\xb7\xaa"));
1494
1495 // U+FDEB
1496 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1497 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1498 "\xef\xb7\xab"));
1499
1500 // U+FDEC
1501 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1502 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1503 "\xef\xb7\xac"));
1504
1505 // U+FDED
1506 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1507 ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1508 "\xef\xb7\xad"));
1509
1510 // U+FDEE
1511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1512 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1513 "\xef\xb7\xae"));
1514
1515 // U+FDEF
1516 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1517 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1518 "\xef\xb7\xaf"));
1519
1520 // U+FDF0
1521 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1522 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1523 "\xef\xb7\xb0"));
1524
1525 // U+FDF1
1526 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1527 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1528 "\xef\xb7\xb1"));
1529
1530 // U+FDF2
1531 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1532 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1533 "\xef\xb7\xb2"));
1534
1535 // U+FDF3
1536 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1537 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1538 "\xef\xb7\xb3"));
1539
1540 // U+FDF4
1541 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1542 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1543 "\xef\xb7\xb4"));
1544
1545 // U+FDF5
1546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1547 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1548 "\xef\xb7\xb5"));
1549
1550 // U+FDF6
1551 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1552 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1553 "\xef\xb7\xb6"));
1554
1555 // U+FDF7
1556 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1557 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1558 "\xef\xb7\xb7"));
1559
1560 // U+FDF8
1561 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1562 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1563 "\xef\xb7\xb8"));
1564
1565 // U+FDF9
1566 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1567 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1568 "\xef\xb7\xb9"));
1569
1570 // U+FDFA
1571 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1572 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1573 "\xef\xb7\xba"));
1574
1575 // U+FDFB
1576 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1577 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1578 "\xef\xb7\xbb"));
1579
1580 // U+FDFC
1581 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1582 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1583 "\xef\xb7\xbc"));
1584
1585 // U+FDFD
1586 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1587 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1588 "\xef\xb7\xbd"));
1589
1590 // U+FDFE
1591 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1592 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1593 "\xef\xb7\xbe"));
1594
1595 // U+FDFF
1596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1597 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1598 "\xef\xb7\xbf"));
1599 }
1600
1601 TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1602 // U+0041 LATIN CAPITAL LETTER A
1603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604 ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1605 "\x41", true));
1606
1607 //
1608 // Sequences with one continuation byte missing
1609 //
1610
1611 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1612 ConvertUTFResultContainer(sourceExhausted),
1613 "\xc2", true));
1614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1615 ConvertUTFResultContainer(sourceExhausted),
1616 "\xdf", true));
1617 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1618 ConvertUTFResultContainer(sourceExhausted),
1619 "\xe0\xa0", true));
1620 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1621 ConvertUTFResultContainer(sourceExhausted),
1622 "\xe0\xbf", true));
1623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624 ConvertUTFResultContainer(sourceExhausted),
1625 "\xe1\x80", true));
1626 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1627 ConvertUTFResultContainer(sourceExhausted),
1628 "\xec\xbf", true));
1629 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1630 ConvertUTFResultContainer(sourceExhausted),
1631 "\xed\x80", true));
1632 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1633 ConvertUTFResultContainer(sourceExhausted),
1634 "\xed\x9f", true));
1635 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1636 ConvertUTFResultContainer(sourceExhausted),
1637 "\xee\x80", true));
1638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639 ConvertUTFResultContainer(sourceExhausted),
1640 "\xef\xbf", true));
1641 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1642 ConvertUTFResultContainer(sourceExhausted),
1643 "\xf0\x90\x80", true));
1644 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1645 ConvertUTFResultContainer(sourceExhausted),
1646 "\xf0\xbf\xbf", true));
1647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1648 ConvertUTFResultContainer(sourceExhausted),
1649 "\xf1\x80\x80", true));
1650 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1651 ConvertUTFResultContainer(sourceExhausted),
1652 "\xf3\xbf\xbf", true));
1653 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1654 ConvertUTFResultContainer(sourceExhausted),
1655 "\xf4\x80\x80", true));
1656 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1657 ConvertUTFResultContainer(sourceExhausted),
1658 "\xf4\x8f\xbf", true));
1659
1660 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1661 ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1662 "\x41\xc2", true));
1663 }
1664