/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 // test_utf8_codecvt.cpp // (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com . // Use, modification and distribution is subject to the Boost Software // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #include // std::copy #include #include #include #include #include #include #include // size_t #include #include #include #define BOOST_UTF8_BEGIN_NAMESPACE namespace boost { namespace detail { #define BOOST_UTF8_END_NAMESPACE } } #include #include #if defined(BOOST_NO_STDC_NAMESPACE) namespace std{ using ::size_t; using ::wcslen; #if !defined(UNDER_CE) && !defined(__PGIC__) using ::w_int; #endif } // namespace std #endif // Note: copied from boost/iostreams/char_traits.hpp // // Dinkumware that comes with QNX Momentics 6.3.0, 4.0.2, incorrectly defines // the EOF and WEOF macros to not std:: qualify the wint_t type (and so does // Sun C++ 5.8 + STLport 4). Fix by placing the def in this scope. // NOTE: Use BOOST_WORKAROUND? #if (defined(__QNX__) && defined(BOOST_DINKUMWARE_STDLIB)) \ || defined(__SUNPRO_CC) using ::std::wint_t; #endif #include template struct test_data { static unsigned char utf8_encoding[]; static wchar_t wchar_encoding[]; }; template<> unsigned char test_data<2>::utf8_encoding[] = { 0x01, 0x7f, 0xc2, 0x80, 0xdf, 0xbf, 0xe0, 0xa0, 0x80, 0xe7, 0xbf, 0xbf }; template<> wchar_t test_data<2>::wchar_encoding[] = { 0x0001, 0x007f, 0x0080, 0x07ff, 0x0800, 0x7fff }; template<> unsigned char test_data<4>::utf8_encoding[] = { 0x01, 0x7f, 0xc2, 0x80, 0xdf, 0xbf, 0xe0, 0xa0, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80, 0xf4, 0x8f, 0xbf, 0xbf, /* codecvt implementations for clang and gcc don't handle more than 21 bits and * return eof accordlingly. So don't test the whole 32 range */ /* 0xf7, 0xbf, 0xbf, 0xbf, 0xf8, 0x88, 0x80, 0x80, 0x80, 0xfb, 0xbf, 0xbf, 0xbf, 0xbf, 0xfc, 0x84, 0x80, 0x80, 0x80, 0x80, 0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf */ }; template<> wchar_t test_data<4>::wchar_encoding[] = { (wchar_t)0x00000001, (wchar_t)0x0000007f, (wchar_t)0x00000080, (wchar_t)0x000007ff, (wchar_t)0x00000800, (wchar_t)0x0000ffff, (wchar_t)0x00010000, (wchar_t)0x0010ffff, /* codecvt implementations for clang and gcc don't handle more than 21 bits and * return eof accordlingly. So don't test the whole 32 range */ /* (wchar_t)0x001fffff, (wchar_t)0x00200000, (wchar_t)0x03ffffff, (wchar_t)0x04000000, (wchar_t)0x7fffffff */ }; int test_main(int /* argc */, char * /* argv */[]) { std::locale utf8_locale = std::locale( std::locale::classic(), new boost::detail::utf8_codecvt_facet ); typedef char utf8_t; // define test data compatible with the wchar_t implementation // as either ucs-2 or ucs-4 depending on the compiler/library. typedef test_data td; // Send our test UTF-8 data to file { std::ofstream ofs; ofs.open("test.dat"); std::copy( td::utf8_encoding, td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char), std::ostream_iterator(ofs) ); } // Read the test data back in, converting to UCS-4 on the way in std::vector from_file; { std::wifstream ifs; ifs.imbue(utf8_locale); ifs.open("test.dat"); std::wint_t item = 0; // note can't use normal vector from iterator constructor because // dinkumware doesn't have it. for(;;){ item = ifs.get(); if(item == WEOF) break; //ifs >> item; //if(ifs.eof()) // break; from_file.push_back(item); } } BOOST_TEST(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding)); // Send the UCS4_data back out, converting to UTF-8 { std::wofstream ofs; ofs.imbue(utf8_locale); ofs.open("test2.dat"); std::copy( from_file.begin(), from_file.end(), std::ostream_iterator(ofs) ); } // Make sure that both files are the same { typedef std::istream_iterator is_iter; is_iter end_iter; std::ifstream ifs1("test.dat"); is_iter it1(ifs1); std::vector data1; std::copy(it1, end_iter, std::back_inserter(data1)); std::ifstream ifs2("test2.dat"); is_iter it2(ifs2); std::vector data2; std::copy(it2, end_iter, std::back_inserter(data2)); BOOST_TEST(data1 == data2); } // some libraries have trouble that only shows up with longer strings const wchar_t * test3_data = L"\ \ \ \ \ 1\ 96953204\ 177129195\ 1\ 5627\ 23010\ 7419\

16212

\ 4086\ 2749\ -33\ 124\ 28\ 32225\ 17543\ 0.84431422\ 1.0170664757130923\ tjbx\ cuwjentqpkejp\
\
\ "; // Send the UCS4_data back out, converting to UTF-8 std::size_t l = std::wcslen(test3_data); { std::wofstream ofs; ofs.imbue(utf8_locale); ofs.open("test3.dat"); std::copy( test3_data, test3_data + l, std::ostream_iterator(ofs) ); } // Make sure that both files are the same { std::wifstream ifs; ifs.imbue(utf8_locale); ifs.open("test3.dat"); ifs >> std::noskipws; BOOST_TEST( std::equal( test3_data, test3_data + l, std::istream_iterator(ifs) ) ); } // Test length calculation { std::codecvt const& fac = std::use_facet< std::codecvt >(utf8_locale); std::mbstate_t mbs = std::mbstate_t(); const int utf8_len = sizeof(td::utf8_encoding) / sizeof(*td::utf8_encoding); int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + utf8_len), ~static_cast< std::size_t >(0u)); BOOST_TEST_EQ(utf8_len, res); } // Test that length calculation detects character boundaries { std::codecvt const& fac = std::use_facet< std::codecvt >(utf8_locale); std::mbstate_t mbs = std::mbstate_t(); // The first 5 bytes of utf8_encoding contain 3 complete UTF-8 characters (taking 4 bytes in total) and 1 byte of an incomplete character. // This last byte should not be accounted by length(). const int input_len = 5; const int utf8_len = 4; int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + input_len), ~static_cast< std::size_t >(0u)); BOOST_TEST_EQ(utf8_len, res); } return EXIT_SUCCESS; } int main(int argc, char * argv[]){ int retval = 1; BOOST_TRY{ retval = test_main(argc, argv); } #ifndef BOOST_NO_EXCEPTION_STD_NAMESPACE BOOST_CATCH(const std::exception & e){ BOOST_ERROR(e.what()); } #endif BOOST_CATCH(...){ BOOST_ERROR("failed with uncaught exception:"); } BOOST_CATCH_END int error_count = boost::report_errors(); if(error_count > 0) retval = error_count; return retval; }