]> git.proxmox.com Git - ceph.git/blame - ceph/src/arrow/cpp/src/arrow/util/utf8.cc
import quincy 17.2.0
[ceph.git] / ceph / src / arrow / cpp / src / arrow / util / utf8.cc
CommitLineData
1d09f67e
TL
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <cstdint>
19#include <iterator>
20#include <mutex>
21#include <stdexcept>
22#include <utility>
23
24#include "arrow/result.h"
25#include "arrow/util/logging.h"
26#include "arrow/util/utf8.h"
27#include "arrow/vendored/utfcpp/checked.h"
28
29// Can be defined by utfcpp
30#ifdef NOEXCEPT
31#undef NOEXCEPT
32#endif
33
34namespace arrow {
35namespace util {
36namespace internal {
37
38// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
39// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
40
41// clang-format off
const uint8_t utf8_small_table[] = {  // NOLINT
    // Part 1 (256 entries): maps each byte value to a character class.
    // Using classes instead of raw bytes reduces the size of the transition
    // table below.  One row per 16 byte values.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00 - 0x0F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10 - 0x1F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x80 - 0x8F
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,  // 0x90 - 0x9F
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  // 0xA0 - 0xAF
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  // 0xB0 - 0xBF
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0 - 0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0 - 0xDF
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,  // 0xE0 - 0xEF
    11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  // 0xF0 - 0xFF

    // Part 2 (9 * 12 entries): the transition table.  It maps a combination
    // of an automaton state and a character class to the next state.
    // Character classes are between 0 and 11, states are multiples of 12.
    // One row per state, one column per character class.
    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72,   // state 0
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  // state 12
    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12,     // state 24
    12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,  // state 36
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12,  // state 48
    12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,  // state 60
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,  // state 72
    12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,  // state 84
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  // state 96
};
63// clang-format on
64
// Expanded transition table with one entry per (state, byte) pair, i.e.
// 9 states * 256 byte values.  It is filled in by InitializeLargeTable().
// Until then only the first entry is non-zero: it holds the 0xffff marker
// that CheckUTF8Initialized() relies on to detect a missing initialization.
uint16_t utf8_large_table[9 * 256] = {0xffff};

// Sixteen byte counts in the range 1 to 4.
// NOTE(review): presumably indexed by the high 4 bits of the first byte of
// a UTF8 sequence to get the sequence length -- confirm against the users
// of this table in utf8.h.
const uint8_t utf8_byte_size_table[16] = {
    1, 1, 1, 1, 1, 1, 1, 1,  // entries 0 - 7
    1, 1, 1, 1,              // entries 8 - 11
    2, 2,                    // entries 12 - 13
    3,                       // entry 14
    4,                       // entry 15
};
68
69static void InitializeLargeTable() {
70 for (uint32_t state = 0; state < 9; ++state) {
71 for (uint32_t byte = 0; byte < 256; ++byte) {
72 uint32_t byte_class = utf8_small_table[byte];
73 uint8_t next_state = utf8_small_table[256 + state * 12 + byte_class] / 12;
74 DCHECK_LT(next_state, 9);
75 utf8_large_table[state * 256 + byte] = static_cast<uint16_t>(next_state * 256);
76 }
77 }
78}
79
80ARROW_EXPORT void CheckUTF8Initialized() {
81 DCHECK_EQ(utf8_large_table[0], 0)
82 << "InitializeUTF8() must be called before calling UTF8 routines";
83}
84
85} // namespace internal
86
87static std::once_flag utf8_initialized;
88
89void InitializeUTF8() {
90 std::call_once(utf8_initialized, internal::InitializeLargeTable);
91}
92
93static const uint8_t kBOM[] = {0xEF, 0xBB, 0xBF};
94
95Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size) {
96 int64_t i;
97 for (i = 0; i < static_cast<int64_t>(sizeof(kBOM)); ++i) {
98 if (size == 0) {
99 if (i == 0) {
100 // Empty string
101 return data;
102 } else {
103 return Status::Invalid("UTF8 string too short (truncated byte order mark?)");
104 }
105 }
106 if (data[i] != kBOM[i]) {
107 // BOM not found
108 return data;
109 }
110 --size;
111 }
112 // BOM found
113 return data + i;
114}
115
116namespace {
117
118// Some platforms (such as old MinGWs) don't have the <codecvt> header,
119// so call into a vendored utf8 implementation instead.
120
121std::wstring UTF8ToWideStringInternal(const std::string& source) {
122 std::wstring ws;
123#if WCHAR_MAX > 0xFFFF
124 ::utf8::utf8to32(source.begin(), source.end(), std::back_inserter(ws));
125#else
126 ::utf8::utf8to16(source.begin(), source.end(), std::back_inserter(ws));
127#endif
128 return ws;
129}
130
131std::string WideStringToUTF8Internal(const std::wstring& source) {
132 std::string s;
133#if WCHAR_MAX > 0xFFFF
134 ::utf8::utf32to8(source.begin(), source.end(), std::back_inserter(s));
135#else
136 ::utf8::utf16to8(source.begin(), source.end(), std::back_inserter(s));
137#endif
138 return s;
139}
140
141} // namespace
142
143Result<std::wstring> UTF8ToWideString(const std::string& source) {
144 try {
145 return UTF8ToWideStringInternal(source);
146 } catch (std::exception& e) {
147 return Status::Invalid(e.what());
148 }
149}
150
151ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source) {
152 try {
153 return WideStringToUTF8Internal(source);
154 } catch (std::exception& e) {
155 return Status::Invalid(e.what());
156 }
157}
158
159} // namespace util
160} // namespace arrow