]>
Commit | Line | Data |
---|---|---|
1d09f67e TL |
1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file | |
3 | // distributed with this work for additional information | |
4 | // regarding copyright ownership. The ASF licenses this file | |
5 | // to you under the Apache License, Version 2.0 (the | |
6 | // "License"); you may not use this file except in compliance | |
7 | // with the License. You may obtain a copy of the License at | |
8 | // | |
9 | // http://www.apache.org/licenses/LICENSE-2.0 | |
10 | // | |
11 | // Unless required by applicable law or agreed to in writing, | |
12 | // software distributed under the License is distributed on an | |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | // KIND, either express or implied. See the License for the | |
15 | // specific language governing permissions and limitations | |
16 | // under the License. | |
17 | ||
18 | #include <cstdint> | |
19 | #include <iterator> | |
20 | #include <mutex> | |
21 | #include <stdexcept> | |
22 | #include <utility> | |
23 | ||
24 | #include "arrow/result.h" | |
25 | #include "arrow/util/logging.h" | |
26 | #include "arrow/util/utf8.h" | |
27 | #include "arrow/vendored/utfcpp/checked.h" | |
28 | ||
29 | // Can be defined by utfcpp | |
30 | #ifdef NOEXCEPT | |
31 | #undef NOEXCEPT | |
32 | #endif | |
33 | ||
34 | namespace arrow { | |
35 | namespace util { | |
36 | namespace internal { | |
37 | ||
38 | // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> | |
39 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | |
40 | ||
41 | // clang-format off | |
// Compact lookup table for the DFA-based UTF-8 decoder (Hoehrmann's design).
//
// Layout:
//   - entries [0, 256): map each possible input byte to a character class
//     (values 0..11);
//   - entries [256, 256 + 9 * 12): the transition table, 9 rows of 12 entries,
//     addressed as 256 + state * 12 + byte_class by InitializeLargeTable().
const uint8_t utf8_small_table[] = { // NOLINT
  // The first part of the table maps bytes to character classes, which
  // reduces the size of the transition table and creates bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // NOLINT
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // NOLINT
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // NOLINT
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // NOLINT

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
  // Character classes are between 0 and 11, states are multiples of 12.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, // NOLINT
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, // NOLINT
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, // NOLINT
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, // NOLINT
  12,36,12,12,12,12,12,12,12,12,12,12, // NOLINT
};
63 | // clang-format on | |
64 | ||
// Expanded transition table with one entry per (state, byte) pair: 9 states
// times 256 byte values. It is filled in at runtime by InitializeLargeTable().
// Only the first element is explicitly initialized (to 0xffff); it acts as a
// "not yet initialized" marker that CheckUTF8Initialized() tests against 0.
uint16_t utf8_large_table[9 * 256] = {0xffff};

// Sixteen-entry table of UTF-8 sequence lengths in bytes (1, 2, 3 or 4).
// NOTE(review): presumably indexed by the high nibble of a sequence's first
// byte (0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4); the lookup itself is
// not in this file, so confirm against the header that uses it.
const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
68 | ||
69 | static void InitializeLargeTable() { | |
70 | for (uint32_t state = 0; state < 9; ++state) { | |
71 | for (uint32_t byte = 0; byte < 256; ++byte) { | |
72 | uint32_t byte_class = utf8_small_table[byte]; | |
73 | uint8_t next_state = utf8_small_table[256 + state * 12 + byte_class] / 12; | |
74 | DCHECK_LT(next_state, 9); | |
75 | utf8_large_table[state * 256 + byte] = static_cast<uint16_t>(next_state * 256); | |
76 | } | |
77 | } | |
78 | } | |
79 | ||
// Assert (via DCHECK_EQ) that the large lookup table has been built.
//
// utf8_large_table[0] is statically initialized to 0xffff and is overwritten
// with 0 by InitializeLargeTable() (state 0, byte 0 transitions to state 0),
// so a non-zero value means InitializeUTF8() has not run yet.
ARROW_EXPORT void CheckUTF8Initialized() {
  DCHECK_EQ(utf8_large_table[0], 0)
      << "InitializeUTF8() must be called before calling UTF8 routines";
}
84 | ||
85 | } // namespace internal | |
86 | ||
87 | static std::once_flag utf8_initialized; | |
88 | ||
89 | void InitializeUTF8() { | |
90 | std::call_once(utf8_initialized, internal::InitializeLargeTable); | |
91 | } | |
92 | ||
93 | static const uint8_t kBOM[] = {0xEF, 0xBB, 0xBF}; | |
94 | ||
95 | Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size) { | |
96 | int64_t i; | |
97 | for (i = 0; i < static_cast<int64_t>(sizeof(kBOM)); ++i) { | |
98 | if (size == 0) { | |
99 | if (i == 0) { | |
100 | // Empty string | |
101 | return data; | |
102 | } else { | |
103 | return Status::Invalid("UTF8 string too short (truncated byte order mark?)"); | |
104 | } | |
105 | } | |
106 | if (data[i] != kBOM[i]) { | |
107 | // BOM not found | |
108 | return data; | |
109 | } | |
110 | --size; | |
111 | } | |
112 | // BOM found | |
113 | return data + i; | |
114 | } | |
115 | ||
116 | namespace { | |
117 | ||
118 | // Some platforms (such as old MinGWs) don't have the <codecvt> header, | |
119 | // so call into a vendored utf8 implementation instead. | |
120 | ||
121 | std::wstring UTF8ToWideStringInternal(const std::string& source) { | |
122 | std::wstring ws; | |
123 | #if WCHAR_MAX > 0xFFFF | |
124 | ::utf8::utf8to32(source.begin(), source.end(), std::back_inserter(ws)); | |
125 | #else | |
126 | ::utf8::utf8to16(source.begin(), source.end(), std::back_inserter(ws)); | |
127 | #endif | |
128 | return ws; | |
129 | } | |
130 | ||
131 | std::string WideStringToUTF8Internal(const std::wstring& source) { | |
132 | std::string s; | |
133 | #if WCHAR_MAX > 0xFFFF | |
134 | ::utf8::utf32to8(source.begin(), source.end(), std::back_inserter(s)); | |
135 | #else | |
136 | ::utf8::utf16to8(source.begin(), source.end(), std::back_inserter(s)); | |
137 | #endif | |
138 | return s; | |
139 | } | |
140 | ||
141 | } // namespace | |
142 | ||
143 | Result<std::wstring> UTF8ToWideString(const std::string& source) { | |
144 | try { | |
145 | return UTF8ToWideStringInternal(source); | |
146 | } catch (std::exception& e) { | |
147 | return Status::Invalid(e.what()); | |
148 | } | |
149 | } | |
150 | ||
151 | ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source) { | |
152 | try { | |
153 | return WideStringToUTF8Internal(source); | |
154 | } catch (std::exception& e) { | |
155 | return Status::Invalid(e.what()); | |
156 | } | |
157 | } | |
158 | ||
159 | } // namespace util | |
160 | } // namespace arrow |