performance_utf8_codecvt.cpp 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  2. // test_utf8_codecvt.cpp
  3. // (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com .
  4. // Use, modification and distribution is subject to the Boost Software
  5. // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. #include <algorithm>
  8. #include <fstream>
  9. #include <iostream>
  10. #include <iterator>
  11. #include <locale>
  12. #include <vector>
  13. #include <string>
  14. #include <cstddef> // size_t
  15. #include <boost/config.hpp>
  16. #if defined(BOOST_NO_STDC_NAMESPACE)
  17. namespace std{
  18. using ::size_t;
  19. } // namespace std
  20. #endif
  21. #include <cwchar>
  22. #ifdef BOOST_NO_STDC_NAMESPACE
  23. namespace std{ using ::wcslen; }
  24. #endif
  25. #include "../test/test_tools.hpp"
  26. #include <boost/archive/iterators/istream_iterator.hpp>
  27. #include <boost/archive/iterators/ostream_iterator.hpp>
  28. #include <boost/archive/add_facet.hpp>
  29. #include <boost/archive/detail/utf8_codecvt_facet.hpp>
  30. template<std::size_t s>
  31. struct test_data
  32. {
  33. static unsigned char utf8_encoding[];
  34. static wchar_t wchar_encoding[];
  35. };
  36. template<>
  37. unsigned char test_data<2>::utf8_encoding[] = {
  38. 0x01,
  39. 0x7f,
  40. 0xc2, 0x80,
  41. 0xdf, 0xbf,
  42. 0xe0, 0xa0, 0x80,
  43. 0xe7, 0xbf, 0xbf
  44. };
  45. template<>
  46. wchar_t test_data<2>::wchar_encoding[] = {
  47. 0x0001,
  48. 0x007f,
  49. 0x0080,
  50. 0x07ff,
  51. 0x0800,
  52. 0x7fff
  53. };
  54. template<>
  55. unsigned char test_data<4>::utf8_encoding[] = {
  56. 0x01,
  57. 0x7f,
  58. 0xc2, 0x80,
  59. 0xdf, 0xbf,
  60. 0xe0, 0xa0, 0x80,
  61. 0xef, 0xbf, 0xbf,
  62. 0xf0, 0x90, 0x80, 0x80,
  63. 0xf4, 0x8f, 0xbf, 0xbf,
  64. 0xf7, 0xbf, 0xbf, 0xbf,
  65. 0xf8, 0x88, 0x80, 0x80, 0x80,
  66. 0xfb, 0xbf, 0xbf, 0xbf, 0xbf,
  67. 0xfc, 0x84, 0x80, 0x80, 0x80, 0x80,
  68. 0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf
  69. };
  70. template<>
  71. wchar_t test_data<4>::wchar_encoding[] = {
  72. 0x00000001,
  73. 0x0000007f,
  74. 0x00000080,
  75. 0x000007ff,
  76. 0x00000800,
  77. 0x0000ffff,
  78. 0x00010000,
  79. 0x0010ffff,
  80. 0x001fffff,
  81. 0x00200000,
  82. 0x03ffffff,
  83. 0x04000000,
  84. 0x7fffffff
  85. };
  86. int
  87. test_main(int /* argc */, char * /* argv */[]) {
  88. std::locale old_loc;
  89. std::locale * utf8_locale
  90. = boost::archive::add_facet(
  91. old_loc,
  92. new boost::archive::detail::utf8_codecvt_facet
  93. );
  94. typedef char utf8_t;
  95. typedef test_data<sizeof(wchar_t)> td;
  96. // Send our test UTF-8 data to file
  97. {
  98. std::ofstream ofs;
  99. ofs.open("test.dat", std::ios::binary);
  100. std::copy(
  101. td::utf8_encoding,
  102. #if ! defined(__BORLANDC__)
  103. // borland 5.60 complains about this
  104. td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char),
  105. #else
  106. // so use this instead
  107. td::utf8_encoding + 12,
  108. #endif
  109. boost::archive::iterators::ostream_iterator<utf8_t>(ofs)
  110. );
  111. }
  112. // Read the test data back in, converting to UCS-4 on the way in
  113. std::vector<wchar_t> from_file;
  114. {
  115. std::wifstream ifs;
  116. ifs.imbue(*utf8_locale);
  117. ifs.open("test.dat");
  118. wchar_t item = 0;
  119. // note can't use normal vector from iterator constructor because
  120. // dinkumware doesn't have it.
  121. for(;;){
  122. item = ifs.get();
  123. if(item == WEOF)
  124. break;
  125. //ifs >> item;
  126. //if(ifs.eof())
  127. // break;
  128. from_file.push_back(item);
  129. }
  130. }
  131. // compare the data read back in with the orginal
  132. #if ! defined(__BORLANDC__)
  133. // borland 5.60 complains about this
  134. BOOST_CHECK(from_file.size() == sizeof(td::wchar_encoding)/sizeof(wchar_t));
  135. #else
  136. // so use this instead
  137. BOOST_CHECK(from_file.size() == 6);
  138. #endif
  139. BOOST_CHECK(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding));
  140. // Send the UCS4_data back out, converting to UTF-8
  141. {
  142. std::wofstream ofs;
  143. ofs.imbue(*utf8_locale);
  144. ofs.open("test2.dat");
  145. std::copy(
  146. from_file.begin(),
  147. from_file.end(),
  148. boost::archive::iterators::ostream_iterator<wchar_t>(ofs)
  149. );
  150. }
  151. // Make sure that both files are the same
  152. {
  153. typedef boost::archive::iterators::istream_iterator<utf8_t> is_iter;
  154. is_iter end_iter;
  155. std::ifstream ifs1("test.dat");
  156. is_iter it1(ifs1);
  157. std::vector<utf8_t> data1;
  158. std::copy(it1, end_iter, std::back_inserter(data1));
  159. std::ifstream ifs2("test2.dat");
  160. is_iter it2(ifs2);
  161. std::vector<utf8_t> data2;
  162. std::copy(it2, end_iter, std::back_inserter(data2));
  163. BOOST_CHECK(data1 == data2);
  164. }
  165. // some libraries have trouble that only shows up with longer strings
  166. wchar_t * test3_data = L"\
  167. <?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\
  168. <!DOCTYPE boost_serialization>\
  169. <boost_serialization signature=\"serialization::archive\" version=\"3\">\
  170. <a class_id=\"0\" tracking_level=\"0\">\
  171. <b>1</b>\
  172. <f>96953204</f>\
  173. <g>177129195</g>\
  174. <l>1</l>\
  175. <m>5627</m>\
  176. <n>23010</n>\
  177. <o>7419</o>\
  178. <p>16212</p>\
  179. <q>4086</q>\
  180. <r>2749</r>\
  181. <c>-33</c>\
  182. <s>124</s>\
  183. <t>28</t>\
  184. <u>32225</u>\
  185. <v>17543</v>\
  186. <w>0.84431422</w>\
  187. <x>1.0170664757130923</x>\
  188. <y>tjbx</y>\
  189. <z>cuwjentqpkejp</z>\
  190. </a>\
  191. </boost_serialization>\
  192. ";
  193. // Send the UCS4_data back out, converting to UTF-8
  194. std::size_t l = std::wcslen(test3_data);
  195. {
  196. std::wofstream ofs;
  197. ofs.imbue(*utf8_locale);
  198. ofs.open("test3.dat");
  199. std::copy(
  200. test3_data,
  201. test3_data + l,
  202. boost::archive::iterators::ostream_iterator<wchar_t>(ofs)
  203. );
  204. }
  205. // Make sure that both files are the same
  206. {
  207. std::wifstream ifs;
  208. ifs.imbue(*utf8_locale);
  209. ifs.open("test3.dat");
  210. BOOST_CHECK(
  211. std::equal(
  212. test3_data,
  213. test3_data + l,
  214. boost::archive::iterators::istream_iterator<wchar_t>(ifs)
  215. )
  216. );
  217. }
  218. delete utf8_locale;
  219. return EXIT_SUCCESS;
  220. }