[ceph.git] / ceph / src / boost / boost / json / detail / utf8.hpp

//
// Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/boostorg/json
//

#ifndef BOOST_JSON_DETAIL_UTF8_HPP
#define BOOST_JSON_DETAIL_UTF8_HPP

#include <cstddef>
#include <cstring>
#include <cstdint>

BOOST_JSON_NS_BEGIN
namespace detail {

// Load the first N bytes at `p` and return them as an unsigned integer
// in which the byte at p[0] occupies the least significant 8 bits,
// p[1] the next 8 bits, and so on. Bits above the N loaded bytes are
// always zero.
//
// N must be between 1 and 4; `p` must point to at least N readable bytes.
template<int N>
std::uint32_t
load_little_endian(void const* p)
{
    static_assert(N > 0 && N <= 4,
        "load_little_endian can load between 1 and 4 bytes");

    // memcpy only writes N bytes, so start from zero: otherwise the
    // remaining 4 - N bytes of `v` would be indeterminate when N < 4.
    std::uint32_t v = 0;
    std::memcpy(&v, p, N);
#ifdef BOOST_JSON_BIG_ENDIAN
    // The copied bytes sit in the most significant positions here;
    // reverse the byte order so that p[0] ends up in the low byte.
    v = ((v & 0xFF000000) >> 24) |
        ((v & 0x00FF0000) >>  8) |
        ((v & 0x0000FF00) <<  8) |
        ((v & 0x000000FF) << 24);
#endif
    return v;
}

// Classify the lead byte of a multi-byte UTF-8 sequence.
//
// Only the low 7 bits of `c` are used: table entry i describes the
// byte (0x80 | i). The function therefore does not distinguish an
// ASCII byte from the byte with the same low 7 bits and the high bit
// set; callers are expected to pass bytes already known to be >= 0x80.
//
// The result packs two fields:
//   low  8 bits: total length of the sequence in bytes (0 if invalid)
//   high 8 bits: class selecting the allowed range of the second byte,
//                as consumed by is_valid_utf8
inline
uint16_t
classify_utf8(char c)
{
    // 0x000 = invalid
    // 0x102 = 2 bytes, second byte [80, BF]
    // 0x203 = 3 bytes, second byte [A0, BF]
    // 0x303 = 3 bytes, second byte [80, BF]
    // 0x403 = 3 bytes, second byte [80, 9F]
    // 0x504 = 4 bytes, second byte [90, BF]
    // 0x604 = 4 bytes, second byte [80, BF]
    // 0x704 = 4 bytes, second byte [80, 8F]
    static constexpr uint16_t first[128]
    {
       // 0x80 - 0xBF: continuation bytes, never valid as a lead byte
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,

       // 0xC0 - 0xDF: 0xC0 and 0xC1 are invalid, the rest start 2 bytes
       0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
       // 0xE0 - 0xEF: 3 bytes; 0xE0 and 0xED restrict the second byte
       0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
       0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
       // 0xF0 - 0xF4: 4 bytes; 0xF5 - 0xFF: invalid
       0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
       0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    };
    // Mask to 7 bits so the index can never run past the 128-entry
    // table, even when the caller passes the raw (unmasked) byte.
    return first[static_cast<unsigned char>(c) & 0x7F];
}

// Check the continuation bytes of a multi-byte UTF-8 sequence.
//
// `p` points at the lead byte of the sequence and `first` is the code
// produced by classify_utf8 for that lead byte. Depending on the class
// stored in the high byte of `first`, 2, 3 or 4 bytes are read from `p`,
// so that many bytes must be readable.
//
// The lead byte itself is not inspected; only `first` decides which
// test applies. Returns false for class 0 (invalid lead byte) and for
// any class outside 1..7.
inline
bool
is_valid_utf8(const char* p, uint16_t first)
{
    // After load_little_endian the lead byte is in bits 0-7, the second
    // byte in bits 8-15, the third in bits 16-23, the fourth in bits
    // 24-31. Each mask keeps the top two bits of the trailing bytes
    // (which must be 10) plus as many top bits of the second byte as
    // are needed to pin it to the range noted on the case.
    uint32_t v;
    switch(first >> 8)
    {
    default:
        return false;

    // 2 bytes, second byte [80, BF]
    case 1:
        v = load_little_endian<2>(p);
        return (v & 0xC000) == 0x8000;

    // 3 bytes, second byte [A0, BF]
    case 2:
        // Use only the endian-corrected load: a raw memcpy on top of it
        // would undo the byte swap on big-endian builds.
        v = load_little_endian<3>(p);
        return (v & 0xC0E000) == 0x80A000;

    // 3 bytes, second byte [80, BF]
    case 3:
        v = load_little_endian<3>(p);
        return (v & 0xC0C000) == 0x808000;

    // 3 bytes, second byte [80, 9F]
    case 4:
        v = load_little_endian<3>(p);
        return (v & 0xC0E000) == 0x808000;

    // 4 bytes, second byte [90, BF]
    case 5:
        v = load_little_endian<4>(p);
        // [90, BF] is not a power-of-two aligned range, so a plain mask
        // cannot express it. With well-formed third/fourth bytes and a
        // second byte of at least 0x90 the addition wraps around 2^32
        // and leaves (second - 0x90) << 8, which must not exceed 0x2F00.
        return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;

    // 4 bytes, second byte [80, BF]
    case 6:
        v = load_little_endian<4>(p);
        return (v & 0xC0C0C000) == 0x80808000;

    // 4 bytes, second byte [80, 8F]
    case 7:
        v = load_little_endian<4>(p);
        return (v & 0xC0C0F000) == 0x80808000;
    }
}

// Small fixed-size buffer that collects the bytes of one multi-byte
// UTF-8 sequence, possibly over several calls.
//
// save() starts a sequence from its lead byte and whatever bytes are
// available, append() adds further bytes, and valid() checks the
// collected bytes once complete() reports that all of them are present.
// NOTE(review): presumably used when an input buffer ends in the middle
// of a sequence - confirm against the callers.
class utf8_sequence
{
    // Collected bytes; only the first size_ entries have been written.
    char seq_[4];
    // classify_utf8 code of the lead byte (length in the low 8 bits).
    uint16_t first_;
    // Number of bytes currently stored in seq_.
    uint8_t size_;

public:
    // Total number of bytes in the sequence, taken from the low byte of
    // the lead byte's classification code.
    uint8_t
    length() const noexcept
    {
        return static_cast<uint8_t>(first_ & 0xFF);
    }

    // Number of bytes still missing before the sequence is complete.
    uint8_t
    needed() const noexcept
    {
        return static_cast<uint8_t>(length() - size_);
    }

    // True once at least length() bytes have been stored.
    bool
    complete() const noexcept
    {
        return !(size_ < length());
    }

    // Pointer to the first stored byte.
    const char*
    data() const noexcept
    {
        return seq_;
    }

    // Begin a new sequence whose lead byte is *p. Classifies the lead
    // byte (with its high bit masked off) and copies length() bytes from
    // `p`, or only `remain` bytes when fewer than that are available.
    void
    save(
        const char* p,
        std::size_t remain) noexcept
    {
        first_ = classify_utf8(*p & 0x7F);
        uint8_t const total = length();
        size_ = remain < total
            ? static_cast<uint8_t>(remain)
            : total;
        std::memcpy(seq_, p, size_);
    }

    // Add bytes from `p` to a sequence started with save(). At most
    // needed() bytes are consumed; if `remain` is smaller, all `remain`
    // bytes are stored.
    //
    // returns true if complete
    bool
    append(
        const char* p,
        std::size_t remain) noexcept
    {
        uint8_t const missing = needed();
        if(BOOST_JSON_UNLIKELY(missing == 0))
            return true;

        if(BOOST_JSON_UNLIKELY(remain < missing))
        {
            // Not enough input to finish: keep what there is.
            if(BOOST_JSON_LIKELY(remain != 0))
            {
                std::memcpy(seq_ + size_, p, remain);
                size_ += static_cast<uint8_t>(remain);
            }
            return false;
        }

        // Enough input: take exactly the missing bytes.
        std::memcpy(seq_ + size_, p, missing);
        size_ = length();
        return true;
    }

    // Validate the collected bytes against the lead byte's class.
    // Asserts that the sequence is complete.
    bool
    valid() const noexcept
    {
        BOOST_ASSERT(complete());
        return is_valid_utf8(seq_, first_);
    }
};

} // detail
BOOST_JSON_NS_END

#endif
Commit	Line	Data
20effc67 TL	1	//
	2	// Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
	3	//
	4	// Distributed under the Boost Software License, Version 1.0. (See accompanying
	5	// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
	6	//
	7	// Official repository: https://github.com/boostorg/json
	8	//
	9
	10	#ifndef BOOST_JSON_DETAIL_UTF8_HPP
	11	#define BOOST_JSON_DETAIL_UTF8_HPP
	12
	13	#include <cstddef>
	14	#include <cstring>
	15	#include <cstdint>
	16
	17	BOOST_JSON_NS_BEGIN
	18	namespace detail {
	19
	20	template<int N>
	21	std::uint32_t
	22	load_little_endian(void const* p)
	23	{
	24	// VFALCO do we need to initialize this to 0?
	25	std::uint32_t v;
	26	std::memcpy(&v, p, N);
	27	#ifdef BOOST_JSON_BIG_ENDIAN
	28	v = ((v & 0xFF000000) >> 24) \|
	29	((v & 0x00FF0000) >> 8) \|
	30	((v & 0x0000FF00) << 8) \|
	31	((v & 0x000000FF) << 24);
	32	#endif
	33	return v;
	34	}
	35
	36	inline
	37	uint16_t
	38	classify_utf8(char c)
	39	{
	40	// 0x000 = invalid
	41	// 0x102 = 2 bytes, second byte [80, BF]
	42	// 0x203 = 3 bytes, second byte [A0, BF]
	43	// 0x303 = 3 bytes, second byte [80, BF]
	44	// 0x403 = 3 bytes, second byte [80, 9F]
	45	// 0x504 = 4 bytes, second byte [90, BF]
	46	// 0x604 = 4 bytes, second byte [80, BF]
	47	// 0x704 = 4 bytes, second byte [80, 8F]
	48	static constexpr uint16_t first[128]
	49	{
	50	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	51	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	52	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	53	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	54	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	55	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	56	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	57	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
	58
	59	0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
	60	0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
	61	0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
	62	0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
	63	0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
	64	0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
65	0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
66	0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
67	};
68	return first[static_cast<unsigned char>(c)];
69	}
70
71	inline
72	bool
73	is_valid_utf8(const char* p, uint16_t first)
74	{
75	uint32_t v;
76	switch(first >> 8)
77	{
78	default:
79	return false;
80
81	// 2 bytes, second byte [80, BF]
82	case 1:
83	v = load_little_endian<2>(p);
84	return (v & 0xC000) == 0x8000;
85
86	// 3 bytes, second byte [A0, BF]
87	case 2:
88	v = load_little_endian<3>(p);
89	std::memcpy(&v, p, 3);
90	return (v & 0xC0E000) == 0x80A000;
91
92	// 3 bytes, second byte [80, BF]
93	case 3:
94	v = load_little_endian<3>(p);
95	return (v & 0xC0C000) == 0x808000;
96
97	// 3 bytes, second byte [80, 9F]
98	case 4:
99	v = load_little_endian<3>(p);
100	return (v & 0xC0E000) == 0x808000;
101
102	// 4 bytes, second byte [90, BF]
103	case 5:
104	v = load_little_endian<4>(p);
105	return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
106
107	// 4 bytes, second byte [80, BF]
108	case 6:
109	v = load_little_endian<4>(p);
110	return (v & 0xC0C0C000) == 0x80808000;
111
112	// 4 bytes, second byte [80, 8F]
113	case 7:
114	v = load_little_endian<4>(p);
115	return (v & 0xC0C0F000) == 0x80808000;
116	}
117	}
118
119	class utf8_sequence
120	{
121	char seq_[4];
122	uint16_t first_;
123	uint8_t size_;
124
125	public:
126	void
127	save(
128	const char* p,
129	std::size_t remain) noexcept
130	{
131	first_ = classify_utf8(*p & 0x7F);
132	if(remain >= length())
133	size_ = length();
134	else
135	size_ = static_cast<uint8_t>(remain);
136	std::memcpy(seq_, p, size_);
137	}
138
139	uint8_t
140	length() const noexcept
141	{
142	return first_ & 0xFF;
143	}
144
145	bool
146	complete() const noexcept
147	{
148	return size_ >= length();
149	}
150
151	// returns true if complete
152	bool
153	append(
154	const char* p,
155	std::size_t remain) noexcept
156	{
157	if(BOOST_JSON_UNLIKELY(needed() == 0))
158	return true;
159	if(BOOST_JSON_LIKELY(remain >= needed()))
160	{
161	std::memcpy(
162	seq_ + size_, p, needed());
163	size_ = length();
164	return true;
165	}
166	if(BOOST_JSON_LIKELY(remain > 0))
167	{
168	std::memcpy(seq_ + size_, p, remain);
169	size_ += static_cast<uint8_t>(remain);
170	}
171	return false;
172	}
173
174	const char*
175	data() const noexcept
176	{
177	return seq_;
178	}
179
180	uint8_t
181	needed() const noexcept
182	{
183	return length() - size_;
184	}
185
186	bool
187	valid() const noexcept
188	{
189	BOOST_ASSERT(size_ >= length());
190	return is_valid_utf8(seq_, first_);
191	}
192	};
193
194	} // detail
195	BOOST_JSON_NS_END
196
197	#endif