ceph/src/boost/libs/detail/test/test_utf8_codecvt.cpp

   1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
   2 // test_utf8_codecvt.cpp
   3
   4 // (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com .
   5 // Use, modification and distribution is subject to the Boost Software
   6 // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
   7 // http://www.boost.org/LICENSE_1_0.txt)
   8
   9 #include <algorithm> // std::copy
  10 #include <fstream>
  11 #include <iostream>
  12 #include <iterator>
  13 #include <locale>
  14 #include <vector>
  15 #include <string>
  16
  17 #include <cstddef> // size_t
  18 #include <cwchar>
  19 #include <boost/config.hpp>
  20
  21 #define BOOST_UTF8_BEGIN_NAMESPACE namespace boost { namespace detail {
  22 #define BOOST_UTF8_END_NAMESPACE } }
  23 #include <boost/detail/utf8_codecvt_facet.hpp>
  24 #include <boost/detail/utf8_codecvt_facet.ipp>
  25
  26 #if defined(BOOST_NO_STDC_NAMESPACE)
  27 namespace std{
  28     using ::size_t;
  29     using ::wcslen;
  30 #if !defined(UNDER_CE) && !defined(__PGIC__)
  31     using ::w_int;
  32 #endif
  33 } // namespace std
  34 #endif
  35
  36 // Note: copied from boost/iostreams/char_traits.hpp
  37 //
  38 // Dinkumware that comes with QNX Momentics 6.3.0, 4.0.2, incorrectly defines
  39 // the EOF and WEOF macros to not std:: qualify the wint_t type (and so does
  40 // Sun C++ 5.8 + STLport 4). Fix by placing the def in this scope.
  41 // NOTE: Use BOOST_WORKAROUND?
  42 #if (defined(__QNX__) && defined(BOOST_DINKUMWARE_STDLIB))  \
  43     || defined(__SUNPRO_CC)
  44     using ::std::wint_t;
  45 #endif
  46
  47 #include <boost/core/lightweight_test.hpp>
  48
  49 template<std::size_t s>
  50 struct test_data
  51 {
  52     static unsigned char utf8_encoding[];
  53     static wchar_t wchar_encoding[];
  54 };
  55
  56 template<>
  57 unsigned char test_data<2>::utf8_encoding[] = {
  58     0x01,
  59     0x7f,
  60     0xc2, 0x80,
  61     0xdf, 0xbf,
  62     0xe0, 0xa0, 0x80,
  63     0xe7, 0xbf, 0xbf
  64 };
  65
  66 template<>
  67 wchar_t test_data<2>::wchar_encoding[] = {
  68     0x0001,
  69     0x007f,
  70     0x0080,
  71     0x07ff,
  72     0x0800,
  73     0x7fff
  74 };
  75
  76 template<>
  77 unsigned char test_data<4>::utf8_encoding[] = {
  78     0x01,
  79     0x7f,
  80     0xc2, 0x80,
  81     0xdf, 0xbf,
  82     0xe0, 0xa0, 0x80,
  83     0xef, 0xbf, 0xbf,
  84     0xf0, 0x90, 0x80, 0x80,
  85     0xf4, 0x8f, 0xbf, 0xbf,
  86     /* codecvt implementations for clang and gcc don't handle more than 21 bits and
  87      * return eof accordlingly.  So don't test the whole 32 range
  88      */
  89     /*
  90     0xf7, 0xbf, 0xbf, 0xbf,
  91     0xf8, 0x88, 0x80, 0x80, 0x80,
  92     0xfb, 0xbf, 0xbf, 0xbf, 0xbf,
  93     0xfc, 0x84, 0x80, 0x80, 0x80, 0x80,
  94     0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf
  95     */
  96 };
  97
  98 template<>
  99 wchar_t test_data<4>::wchar_encoding[] = {
 100     (wchar_t)0x00000001,
 101     (wchar_t)0x0000007f,
 102     (wchar_t)0x00000080,
 103     (wchar_t)0x000007ff,
 104     (wchar_t)0x00000800,
 105     (wchar_t)0x0000ffff,
 106     (wchar_t)0x00010000,
 107     (wchar_t)0x0010ffff,
 108     /* codecvt implementations for clang and gcc don't handle more than 21 bits and
 109      * return eof accordlingly.  So don't test the whole 32 range
 110      */
 111     /*
 112     (wchar_t)0x001fffff,
 113     (wchar_t)0x00200000,
 114     (wchar_t)0x03ffffff,
 115     (wchar_t)0x04000000,
 116     (wchar_t)0x7fffffff
 117     */
 118 };
 119
 120 int
 121 test_main(int /* argc */, char * /* argv */[]) {
 122     std::locale utf8_locale
 123         = std::locale(
 124             std::locale::classic(),
 125             new boost::detail::utf8_codecvt_facet
 126         );
 127
 128     typedef char utf8_t;
 129     // define test data compatible with the wchar_t implementation
 130     // as either ucs-2 or ucs-4 depending on the compiler/library.
 131     typedef test_data<sizeof(wchar_t)> td;
 132
 133     // Send our test UTF-8 data to file
 134     {
 135         std::ofstream ofs;
 136         ofs.open("test.dat");
 137         std::copy(
 138             td::utf8_encoding,
 139             td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char),
 140             std::ostream_iterator<utf8_t>(ofs)
 141         );
 142     }
 143
 144     // Read the test data back in, converting to UCS-4 on the way in
 145     std::vector<wchar_t> from_file;
 146     {
 147         std::wifstream ifs;
 148         ifs.imbue(utf8_locale);
 149         ifs.open("test.dat");
 150
 151         std::wint_t item = 0;
 152         // note can't use normal vector from iterator constructor because
 153         // dinkumware doesn't have it.
 154         for(;;){
 155             item = ifs.get();
 156             if(item == WEOF)
 157                 break;
 158             //ifs >> item;
 159             //if(ifs.eof())
 160             //    break;
 161             from_file.push_back(item);
 162         }
 163     }
 164
 165     BOOST_TEST(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding));
 166
 167     // Send the UCS4_data back out, converting to UTF-8
 168     {
 169         std::wofstream ofs;
 170         ofs.imbue(utf8_locale);
 171         ofs.open("test2.dat");
 172         std::copy(
 173             from_file.begin(),
 174             from_file.end(),
 175             std::ostream_iterator<wchar_t, wchar_t>(ofs)
 176         );
 177     }
 178
 179     // Make sure that both files are the same
 180     {
 181         typedef std::istream_iterator<utf8_t> is_iter;
 182         is_iter end_iter;
 183
 184         std::ifstream ifs1("test.dat");
 185         is_iter it1(ifs1);
 186         std::vector<utf8_t> data1;
 187         std::copy(it1, end_iter, std::back_inserter(data1));
 188
 189         std::ifstream ifs2("test2.dat");
 190         is_iter it2(ifs2);
 191         std::vector<utf8_t> data2;
 192         std::copy(it2, end_iter, std::back_inserter(data2));
 193
 194         BOOST_TEST(data1 == data2);
 195     }
 196
 197     // some libraries have trouble that only shows up with longer strings
 198
 199     const wchar_t * test3_data = L"\
 200     <?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\
 201     <!DOCTYPE boost_serialization>\
 202     <boost_serialization signature=\"serialization::archive\" version=\"3\">\
 203     <a class_id=\"0\" tracking_level=\"0\">\
 204         <b>1</b>\
 205         <f>96953204</f>\
 206         <g>177129195</g>\
 207         <l>1</l>\
 208         <m>5627</m>\
 209         <n>23010</n>\
 210         <o>7419</o>\
 211         <p>16212</p>\
 212         <q>4086</q>\
 213         <r>2749</r>\
 214         <c>-33</c>\
 215         <s>124</s>\
 216         <t>28</t>\
 217         <u>32225</u>\
 218         <v>17543</v>\
 219         <w>0.84431422</w>\
 220         <x>1.0170664757130923</x>\
 221         <y>tjbx</y>\
 222         <z>cuwjentqpkejp</z>\
 223     </a>\
 224     </boost_serialization>\
 225     ";
 226
 227     // Send the UCS4_data back out, converting to UTF-8
 228     std::size_t l = std::wcslen(test3_data);
 229     {
 230         std::wofstream ofs;
 231         ofs.imbue(utf8_locale);
 232         ofs.open("test3.dat");
 233         std::copy(
 234             test3_data,
 235             test3_data + l,
 236             std::ostream_iterator<wchar_t, wchar_t>(ofs)
 237         );
 238     }
 239
 240     // Make sure that both files are the same
 241     {
 242         std::wifstream ifs;
 243         ifs.imbue(utf8_locale);
 244         ifs.open("test3.dat");
 245         ifs >> std::noskipws;
 246         BOOST_TEST(
 247             std::equal(
 248                 test3_data,
 249                 test3_data + l,
 250                 std::istream_iterator<wchar_t, wchar_t>(ifs)
 251             )
 252         );
 253     }
 254
 255     // Test length calculation
 256     {
 257         std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
 258         std::mbstate_t mbs = std::mbstate_t();
 259         const int utf8_len = sizeof(td::utf8_encoding) / sizeof(*td::utf8_encoding);
 260         int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + utf8_len), ~static_cast< std::size_t >(0u));
 261         BOOST_TEST_EQ(utf8_len, res);
 262     }
 263
 264     // Test that length calculation detects character boundaries
 265     {
 266         std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
 267         std::mbstate_t mbs = std::mbstate_t();
 268         // The first 5 bytes of utf8_encoding contain 3 complete UTF-8 characters (taking 4 bytes in total) and 1 byte of an incomplete character.
 269         // This last byte should not be accounted by length().
 270         const int input_len = 5;
 271         const int utf8_len = 4;
 272         int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + input_len), ~static_cast< std::size_t >(0u));
 273         BOOST_TEST_EQ(utf8_len, res);
 274     }
 275
 276     return EXIT_SUCCESS;
 277 }
 278
 279 int
 280 main(int argc, char * argv[]){
 281
 282     int retval = 1;
 283     BOOST_TRY{
 284         retval = test_main(argc, argv);
 285     }
 286     #ifndef BOOST_NO_EXCEPTION_STD_NAMESPACE
 287         BOOST_CATCH(const std::exception & e){
 288             BOOST_ERROR(e.what());
 289         }
 290     #endif
 291     BOOST_CATCH(...){
 292         BOOST_ERROR("failed with uncaught exception:");
 293     }
 294     BOOST_CATCH_END
 295
 296     int error_count = boost::report_errors();
 297     if(error_count > 0)
 298         retval = error_count;
 299     return retval;
 300 }
 301