test_utf8_codecvt.cpp 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  2. // test_utf8_codecvt.cpp
  3. // (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com .
  4. // Use, modification and distribution is subject to the Boost Software
  5. // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
#include <algorithm> // std::copy
#include <cstddef> // size_t
#include <cstdlib> // EXIT_SUCCESS
#include <cwchar>
#include <fstream>
#include <iostream>
#include <iterator>
#include <locale>
#include <string>
#include <vector>
  16. #include <boost/config.hpp>
  17. #include <boost/core/no_exceptions_support.hpp>
  18. #define BOOST_UTF8_BEGIN_NAMESPACE namespace boost { namespace detail {
  19. #define BOOST_UTF8_END_NAMESPACE } }
  20. #include <boost/detail/utf8_codecvt_facet.hpp>
  21. #include <boost/detail/utf8_codecvt_facet.ipp>
  22. #if defined(BOOST_NO_STDC_NAMESPACE)
  23. namespace std{
  24. using ::size_t;
  25. using ::wcslen;
  26. #if !defined(UNDER_CE) && !defined(__PGIC__)
  27. using ::w_int;
  28. #endif
  29. } // namespace std
  30. #endif
  31. // Note: copied from boost/iostreams/char_traits.hpp
  32. //
  33. // Dinkumware that comes with QNX Momentics 6.3.0, 4.0.2, incorrectly defines
  34. // the EOF and WEOF macros to not std:: qualify the wint_t type (and so does
  35. // Sun C++ 5.8 + STLport 4). Fix by placing the def in this scope.
  36. // NOTE: Use BOOST_WORKAROUND?
  37. #if (defined(__QNX__) && defined(BOOST_DINKUMWARE_STDLIB)) \
  38. || defined(__SUNPRO_CC)
  39. using ::std::wint_t;
  40. #endif
  41. #include <boost/core/lightweight_test.hpp>
  42. template<std::size_t s>
  43. struct test_data
  44. {
  45. static unsigned char utf8_encoding[];
  46. static wchar_t wchar_encoding[];
  47. };
  48. template<>
  49. unsigned char test_data<2>::utf8_encoding[] = {
  50. 0x01,
  51. 0x7f,
  52. 0xc2, 0x80,
  53. 0xdf, 0xbf,
  54. 0xe0, 0xa0, 0x80,
  55. 0xe7, 0xbf, 0xbf
  56. };
  57. template<>
  58. wchar_t test_data<2>::wchar_encoding[] = {
  59. 0x0001,
  60. 0x007f,
  61. 0x0080,
  62. 0x07ff,
  63. 0x0800,
  64. 0x7fff
  65. };
  66. template<>
  67. unsigned char test_data<4>::utf8_encoding[] = {
  68. 0x01,
  69. 0x7f,
  70. 0xc2, 0x80,
  71. 0xdf, 0xbf,
  72. 0xe0, 0xa0, 0x80,
  73. 0xef, 0xbf, 0xbf,
  74. 0xf0, 0x90, 0x80, 0x80,
  75. 0xf4, 0x8f, 0xbf, 0xbf,
  76. /* codecvt implementations for clang and gcc don't handle more than 21 bits and
  77. * return eof accordlingly. So don't test the whole 32 range
  78. */
  79. /*
  80. 0xf7, 0xbf, 0xbf, 0xbf,
  81. 0xf8, 0x88, 0x80, 0x80, 0x80,
  82. 0xfb, 0xbf, 0xbf, 0xbf, 0xbf,
  83. 0xfc, 0x84, 0x80, 0x80, 0x80, 0x80,
  84. 0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf
  85. */
  86. };
  87. template<>
  88. wchar_t test_data<4>::wchar_encoding[] = {
  89. (wchar_t)0x00000001,
  90. (wchar_t)0x0000007f,
  91. (wchar_t)0x00000080,
  92. (wchar_t)0x000007ff,
  93. (wchar_t)0x00000800,
  94. (wchar_t)0x0000ffff,
  95. (wchar_t)0x00010000,
  96. (wchar_t)0x0010ffff,
  97. /* codecvt implementations for clang and gcc don't handle more than 21 bits and
  98. * return eof accordlingly. So don't test the whole 32 range
  99. */
  100. /*
  101. (wchar_t)0x001fffff,
  102. (wchar_t)0x00200000,
  103. (wchar_t)0x03ffffff,
  104. (wchar_t)0x04000000,
  105. (wchar_t)0x7fffffff
  106. */
  107. };
  108. int
  109. test_main(int /* argc */, char * /* argv */[]) {
  110. std::locale utf8_locale
  111. = std::locale(
  112. std::locale::classic(),
  113. new boost::detail::utf8_codecvt_facet
  114. );
  115. typedef char utf8_t;
  116. // define test data compatible with the wchar_t implementation
  117. // as either ucs-2 or ucs-4 depending on the compiler/library.
  118. typedef test_data<sizeof(wchar_t)> td;
  119. // Send our test UTF-8 data to file
  120. {
  121. std::ofstream ofs;
  122. ofs.open("test.dat");
  123. std::copy(
  124. td::utf8_encoding,
  125. td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char),
  126. std::ostream_iterator<utf8_t>(ofs)
  127. );
  128. }
  129. // Read the test data back in, converting to UCS-4 on the way in
  130. std::vector<wchar_t> from_file;
  131. {
  132. std::wifstream ifs;
  133. ifs.imbue(utf8_locale);
  134. ifs.open("test.dat");
  135. std::wint_t item = 0;
  136. // note can't use normal vector from iterator constructor because
  137. // dinkumware doesn't have it.
  138. for(;;){
  139. item = ifs.get();
  140. if(item == WEOF)
  141. break;
  142. //ifs >> item;
  143. //if(ifs.eof())
  144. // break;
  145. from_file.push_back(item);
  146. }
  147. }
  148. BOOST_TEST(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding));
  149. // Send the UCS4_data back out, converting to UTF-8
  150. {
  151. std::wofstream ofs;
  152. ofs.imbue(utf8_locale);
  153. ofs.open("test2.dat");
  154. std::copy(
  155. from_file.begin(),
  156. from_file.end(),
  157. std::ostream_iterator<wchar_t, wchar_t>(ofs)
  158. );
  159. }
  160. // Make sure that both files are the same
  161. {
  162. typedef std::istream_iterator<utf8_t> is_iter;
  163. is_iter end_iter;
  164. std::ifstream ifs1("test.dat");
  165. is_iter it1(ifs1);
  166. std::vector<utf8_t> data1;
  167. std::copy(it1, end_iter, std::back_inserter(data1));
  168. std::ifstream ifs2("test2.dat");
  169. is_iter it2(ifs2);
  170. std::vector<utf8_t> data2;
  171. std::copy(it2, end_iter, std::back_inserter(data2));
  172. BOOST_TEST(data1 == data2);
  173. }
  174. // some libraries have trouble that only shows up with longer strings
  175. const wchar_t * test3_data = L"\
  176. <?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\
  177. <!DOCTYPE boost_serialization>\
  178. <boost_serialization signature=\"serialization::archive\" version=\"3\">\
  179. <a class_id=\"0\" tracking_level=\"0\">\
  180. <b>1</b>\
  181. <f>96953204</f>\
  182. <g>177129195</g>\
  183. <l>1</l>\
  184. <m>5627</m>\
  185. <n>23010</n>\
  186. <o>7419</o>\
  187. <p>16212</p>\
  188. <q>4086</q>\
  189. <r>2749</r>\
  190. <c>-33</c>\
  191. <s>124</s>\
  192. <t>28</t>\
  193. <u>32225</u>\
  194. <v>17543</v>\
  195. <w>0.84431422</w>\
  196. <x>1.0170664757130923</x>\
  197. <y>tjbx</y>\
  198. <z>cuwjentqpkejp</z>\
  199. </a>\
  200. </boost_serialization>\
  201. ";
  202. // Send the UCS4_data back out, converting to UTF-8
  203. std::size_t l = std::wcslen(test3_data);
  204. {
  205. std::wofstream ofs;
  206. ofs.imbue(utf8_locale);
  207. ofs.open("test3.dat");
  208. std::copy(
  209. test3_data,
  210. test3_data + l,
  211. std::ostream_iterator<wchar_t, wchar_t>(ofs)
  212. );
  213. }
  214. // Make sure that both files are the same
  215. {
  216. std::wifstream ifs;
  217. ifs.imbue(utf8_locale);
  218. ifs.open("test3.dat");
  219. ifs >> std::noskipws;
  220. BOOST_TEST(
  221. std::equal(
  222. test3_data,
  223. test3_data + l,
  224. std::istream_iterator<wchar_t, wchar_t>(ifs)
  225. )
  226. );
  227. }
  228. // Test length calculation
  229. {
  230. std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
  231. std::mbstate_t mbs = std::mbstate_t();
  232. const int utf8_len = sizeof(td::utf8_encoding) / sizeof(*td::utf8_encoding);
  233. int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + utf8_len), ~static_cast< std::size_t >(0u));
  234. BOOST_TEST_EQ(utf8_len, res);
  235. }
  236. // Test that length calculation detects character boundaries
  237. {
  238. std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
  239. std::mbstate_t mbs = std::mbstate_t();
  240. // The first 5 bytes of utf8_encoding contain 3 complete UTF-8 characters (taking 4 bytes in total) and 1 byte of an incomplete character.
  241. // This last byte should not be accounted by length().
  242. const int input_len = 5;
  243. const int utf8_len = 4;
  244. int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + input_len), ~static_cast< std::size_t >(0u));
  245. BOOST_TEST_EQ(utf8_len, res);
  246. }
  247. return EXIT_SUCCESS;
  248. }
  249. int
  250. main(int argc, char * argv[]){
  251. int retval = 1;
  252. BOOST_TRY{
  253. retval = test_main(argc, argv);
  254. }
  255. #ifndef BOOST_NO_EXCEPTION_STD_NAMESPACE
  256. BOOST_CATCH(const std::exception & e){
  257. BOOST_ERROR(e.what());
  258. }
  259. #endif
  260. BOOST_CATCH(...){
  261. BOOST_ERROR("failed with uncaught exception:");
  262. }
  263. BOOST_CATCH_END
  264. int error_count = boost::report_errors();
  265. if(error_count > 0)
  266. retval = error_count;
  267. return retval;
  268. }