ceph/src/boost/libs/detail/test/test_utf8_codecvt.cpp

   1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
   2 // test_utf8_codecvt.cpp
   3
   4 // (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com .
   5 // Use, modification and distribution is subject to the Boost Software
   6 // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
   7 // http://www.boost.org/LICENSE_1_0.txt)
   8
   9 #include <algorithm> // std::copy
  10 #include <fstream>
  11 #include <iostream>
  12 #include <iterator>
  13 #include <locale>
  14 #include <vector>
  15 #include <string>
  16
  17 #include <cstddef> // size_t
  18 #include <cwchar>
  19 #include <boost/config.hpp>
  20 #include <boost/core/no_exceptions_support.hpp>
  21
  22 #define BOOST_UTF8_BEGIN_NAMESPACE namespace boost { namespace detail {
  23 #define BOOST_UTF8_END_NAMESPACE } }
  24 #include <boost/detail/utf8_codecvt_facet.hpp>
  25 #include <boost/detail/utf8_codecvt_facet.ipp>
  26
  27 #if defined(BOOST_NO_STDC_NAMESPACE)
  28 namespace std{
  29     using ::size_t;
  30     using ::wcslen;
  31 #if !defined(UNDER_CE) && !defined(__PGIC__)
  32     using ::w_int;
  33 #endif
  34 } // namespace std
  35 #endif
  36
  37 // Note: copied from boost/iostreams/char_traits.hpp
  38 //
  39 // Dinkumware that comes with QNX Momentics 6.3.0, 4.0.2, incorrectly defines
  40 // the EOF and WEOF macros to not std:: qualify the wint_t type (and so does
  41 // Sun C++ 5.8 + STLport 4). Fix by placing the def in this scope.
  42 // NOTE: Use BOOST_WORKAROUND?
  43 #if (defined(__QNX__) && defined(BOOST_DINKUMWARE_STDLIB))  \
  44     || defined(__SUNPRO_CC)
  45     using ::std::wint_t;
  46 #endif
  47
  48 #include <boost/core/lightweight_test.hpp>
  49
  50 template<std::size_t s>
  51 struct test_data
  52 {
  53     static unsigned char utf8_encoding[];
  54     static wchar_t wchar_encoding[];
  55 };
  56
  57 template<>
  58 unsigned char test_data<2>::utf8_encoding[] = {
  59     0x01,
  60     0x7f,
  61     0xc2, 0x80,
  62     0xdf, 0xbf,
  63     0xe0, 0xa0, 0x80,
  64     0xe7, 0xbf, 0xbf
  65 };
  66
  67 template<>
  68 wchar_t test_data<2>::wchar_encoding[] = {
  69     0x0001,
  70     0x007f,
  71     0x0080,
  72     0x07ff,
  73     0x0800,
  74     0x7fff
  75 };
  76
  77 template<>
  78 unsigned char test_data<4>::utf8_encoding[] = {
  79     0x01,
  80     0x7f,
  81     0xc2, 0x80,
  82     0xdf, 0xbf,
  83     0xe0, 0xa0, 0x80,
  84     0xef, 0xbf, 0xbf,
  85     0xf0, 0x90, 0x80, 0x80,
  86     0xf4, 0x8f, 0xbf, 0xbf,
  87     /* codecvt implementations for clang and gcc don't handle more than 21 bits and
  88      * return eof accordlingly.  So don't test the whole 32 range
  89      */
  90     /*
  91     0xf7, 0xbf, 0xbf, 0xbf,
  92     0xf8, 0x88, 0x80, 0x80, 0x80,
  93     0xfb, 0xbf, 0xbf, 0xbf, 0xbf,
  94     0xfc, 0x84, 0x80, 0x80, 0x80, 0x80,
  95     0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf
  96     */
  97 };
  98
  99 template<>
 100 wchar_t test_data<4>::wchar_encoding[] = {
 101     (wchar_t)0x00000001,
 102     (wchar_t)0x0000007f,
 103     (wchar_t)0x00000080,
 104     (wchar_t)0x000007ff,
 105     (wchar_t)0x00000800,
 106     (wchar_t)0x0000ffff,
 107     (wchar_t)0x00010000,
 108     (wchar_t)0x0010ffff,
 109     /* codecvt implementations for clang and gcc don't handle more than 21 bits and
 110      * return eof accordlingly.  So don't test the whole 32 range
 111      */
 112     /*
 113     (wchar_t)0x001fffff,
 114     (wchar_t)0x00200000,
 115     (wchar_t)0x03ffffff,
 116     (wchar_t)0x04000000,
 117     (wchar_t)0x7fffffff
 118     */
 119 };
 120
 121 int
 122 test_main(int /* argc */, char * /* argv */[]) {
 123     std::locale utf8_locale
 124         = std::locale(
 125             std::locale::classic(),
 126             new boost::detail::utf8_codecvt_facet
 127         );
 128
 129     typedef char utf8_t;
 130     // define test data compatible with the wchar_t implementation
 131     // as either ucs-2 or ucs-4 depending on the compiler/library.
 132     typedef test_data<sizeof(wchar_t)> td;
 133
 134     // Send our test UTF-8 data to file
 135     {
 136         std::ofstream ofs;
 137         ofs.open("test.dat");
 138         std::copy(
 139             td::utf8_encoding,
 140             td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char),
 141             std::ostream_iterator<utf8_t>(ofs)
 142         );
 143     }
 144
 145     // Read the test data back in, converting to UCS-4 on the way in
 146     std::vector<wchar_t> from_file;
 147     {
 148         std::wifstream ifs;
 149         ifs.imbue(utf8_locale);
 150         ifs.open("test.dat");
 151
 152         std::wint_t item = 0;
 153         // note can't use normal vector from iterator constructor because
 154         // dinkumware doesn't have it.
 155         for(;;){
 156             item = ifs.get();
 157             if(item == WEOF)
 158                 break;
 159             //ifs >> item;
 160             //if(ifs.eof())
 161             //    break;
 162             from_file.push_back(item);
 163         }
 164     }
 165
 166     BOOST_TEST(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding));
 167
 168     // Send the UCS4_data back out, converting to UTF-8
 169     {
 170         std::wofstream ofs;
 171         ofs.imbue(utf8_locale);
 172         ofs.open("test2.dat");
 173         std::copy(
 174             from_file.begin(),
 175             from_file.end(),
 176             std::ostream_iterator<wchar_t, wchar_t>(ofs)
 177         );
 178     }
 179
 180     // Make sure that both files are the same
 181     {
 182         typedef std::istream_iterator<utf8_t> is_iter;
 183         is_iter end_iter;
 184
 185         std::ifstream ifs1("test.dat");
 186         is_iter it1(ifs1);
 187         std::vector<utf8_t> data1;
 188         std::copy(it1, end_iter, std::back_inserter(data1));
 189
 190         std::ifstream ifs2("test2.dat");
 191         is_iter it2(ifs2);
 192         std::vector<utf8_t> data2;
 193         std::copy(it2, end_iter, std::back_inserter(data2));
 194
 195         BOOST_TEST(data1 == data2);
 196     }
 197
 198     // some libraries have trouble that only shows up with longer strings
 199
 200     const wchar_t * test3_data = L"\
 201     <?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\
 202     <!DOCTYPE boost_serialization>\
 203     <boost_serialization signature=\"serialization::archive\" version=\"3\">\
 204     <a class_id=\"0\" tracking_level=\"0\">\
 205         <b>1</b>\
 206         <f>96953204</f>\
 207         <g>177129195</g>\
 208         <l>1</l>\
 209         <m>5627</m>\
 210         <n>23010</n>\
 211         <o>7419</o>\
 212         <p>16212</p>\
 213         <q>4086</q>\
 214         <r>2749</r>\
 215         <c>-33</c>\
 216         <s>124</s>\
 217         <t>28</t>\
 218         <u>32225</u>\
 219         <v>17543</v>\
 220         <w>0.84431422</w>\
 221         <x>1.0170664757130923</x>\
 222         <y>tjbx</y>\
 223         <z>cuwjentqpkejp</z>\
 224     </a>\
 225     </boost_serialization>\
 226     ";
 227
 228     // Send the UCS4_data back out, converting to UTF-8
 229     std::size_t l = std::wcslen(test3_data);
 230     {
 231         std::wofstream ofs;
 232         ofs.imbue(utf8_locale);
 233         ofs.open("test3.dat");
 234         std::copy(
 235             test3_data,
 236             test3_data + l,
 237             std::ostream_iterator<wchar_t, wchar_t>(ofs)
 238         );
 239     }
 240
 241     // Make sure that both files are the same
 242     {
 243         std::wifstream ifs;
 244         ifs.imbue(utf8_locale);
 245         ifs.open("test3.dat");
 246         ifs >> std::noskipws;
 247         BOOST_TEST(
 248             std::equal(
 249                 test3_data,
 250                 test3_data + l,
 251                 std::istream_iterator<wchar_t, wchar_t>(ifs)
 252             )
 253         );
 254     }
 255
 256     // Test length calculation
 257     {
 258         std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
 259         std::mbstate_t mbs = std::mbstate_t();
 260         const int utf8_len = sizeof(td::utf8_encoding) / sizeof(*td::utf8_encoding);
 261         int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + utf8_len), ~static_cast< std::size_t >(0u));
 262         BOOST_TEST_EQ(utf8_len, res);
 263     }
 264
 265     // Test that length calculation detects character boundaries
 266     {
 267         std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
 268         std::mbstate_t mbs = std::mbstate_t();
 269         // The first 5 bytes of utf8_encoding contain 3 complete UTF-8 characters (taking 4 bytes in total) and 1 byte of an incomplete character.
 270         // This last byte should not be accounted by length().
 271         const int input_len = 5;
 272         const int utf8_len = 4;
 273         int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + input_len), ~static_cast< std::size_t >(0u));
 274         BOOST_TEST_EQ(utf8_len, res);
 275     }
 276
 277     return EXIT_SUCCESS;
 278 }
 279
 280 int
 281 main(int argc, char * argv[]){
 282
 283     int retval = 1;
 284     BOOST_TRY{
 285         retval = test_main(argc, argv);
 286     }
 287     #ifndef BOOST_NO_EXCEPTION_STD_NAMESPACE
 288         BOOST_CATCH(const std::exception & e){
 289             BOOST_ERROR(e.what());
 290         }
 291     #endif
 292     BOOST_CATCH(...){
 293         BOOST_ERROR("failed with uncaught exception:");
 294     }
 295     BOOST_CATCH_END
 296
 297     int error_count = boost::report_errors();
 298     if(error_count > 0)
 299         retval = error_count;
 300     return retval;
 301 }
 302