utf8_codecvt_facet.hpp 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. #ifndef BOOST_UTF8_CODECVT_FACET_HPP
  2. #define BOOST_UTF8_CODECVT_FACET_HPP
  3. #include <boost/iostreams/detail/config/wide_streams.hpp>
  4. #ifdef BOOST_IOSTREAMS_NO_WIDE_STREAMS
  5. # error wide streams not supported on this platform
  6. #endif
  7. // MS compatible compilers support #pragma once
  8. #if defined(_MSC_VER)
  9. # pragma once
  10. #endif
  11. /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  12. // utf8_codecvt_facet.hpp
  13. // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
  14. // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
  15. // Distributed under the Boost Software License, Version 1.0. (See accompany-
  16. // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  17. // Note:(Robert Ramey). I have made the following alterations in the original
  18. // code.
  19. // a) Rendered utf8_codecvt<wchar_t, char> with using templates
  20. // b) Move longer functions outside class definition to prevent inlining
  21. // and make code smaller
  22. // c) added on a derived class to permit translation to/from current
  23. // locale to utf8
  24. // See http://www.boost.org for updates, documentation, and revision history.
  25. // archives stored as text - note these are templated on the basic
  26. // stream templates to accommodate wide (and other?) kind of characters
  27. //
  28. // note the fact that on libraries without wide characters, ostream
  29. // is not a specialization of basic_ostream which in fact is not defined
  30. // in such cases. So we can't use basic_ostream<OStream::char_type> but rather
  31. // use two template parameters
  32. //
  33. // utf8_codecvt_facet
  34. // This is an implementation of a std::codecvt facet for translating
  35. // from UTF-8 externally to UCS-4. Note that this is not tied to
  36. // any specific types in order to allow customization on platforms
  37. // where wchar_t is not big enough.
  38. //
  39. // NOTES: The current implementation jumps through some unpleasant hoops in
  40. // order to deal with signed character types. As a result,
  41. // it is necessary for the ExternType to be convertible to unsigned char.
  42. // I chose not to tie the extern_type explicitly to char. But if any combination
  43. // of types other than <wchar_t, char> is used, then std::codecvt must be
  44. // specialized on those types for this to work.
  45. #include <locale>
  46. #include <cstddef> // size_t
  47. #include <cwchar> // mbstate_t
  48. #include <boost/integer_traits.hpp>
  49. #include <boost/iostreams/detail/config/wide_streams.hpp>
  50. #include <boost/iostreams/detail/codecvt_helper.hpp>
  51. // maximum length of a multibyte string
  52. #define MB_LENGTH_MAX 8
  53. struct utf8_codecvt_facet_wchar_t
  54. : public boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t>
  55. {
  56. public:
  57. explicit utf8_codecvt_facet_wchar_t(std::size_t no_locale_manage = 0)
  58. : boost::iostreams::detail::codecvt_helper<wchar_t, char, std::mbstate_t>
  59. (no_locale_manage)
  60. { }
  61. protected:
  62. virtual std::codecvt_base::result do_in(
  63. std::mbstate_t& state,
  64. const char * from,
  65. const char * from_end,
  66. const char * & from_next,
  67. wchar_t * to,
  68. wchar_t * to_end,
  69. wchar_t*& to_next
  70. ) const;
  71. virtual std::codecvt_base::result do_out(
  72. std::mbstate_t & state, const wchar_t * from,
  73. const wchar_t * from_end, const wchar_t* & from_next,
  74. char * to, char * to_end, char * & to_next
  75. ) const;
  76. bool invalid_continuing_octet(unsigned char octet_1) const {
  77. return (octet_1 < 0x80|| 0xbf< octet_1);
  78. }
  79. bool invalid_leading_octet(unsigned char octet_1) const {
  80. return (0x7f < octet_1 && octet_1 < 0xc0) ||
  81. (octet_1 > 0xfd);
  82. }
  83. // continuing octets = octets except for the leading octet
  84. static unsigned int get_cont_octet_count(unsigned char lead_octet) {
  85. return get_octet_count(lead_octet) - 1;
  86. }
  87. static unsigned int get_octet_count(unsigned char lead_octet);
  88. // How many "continuing octets" will be needed for this word
  89. // == total octets - 1.
  90. int get_cont_octet_out_count(wchar_t word) const ;
  91. virtual bool do_always_noconv() const throw() { return false; }
  92. // UTF-8 isn't really stateful since we rewind on partial conversions
  93. virtual std::codecvt_base::result do_unshift(
  94. std::mbstate_t&,
  95. char * from,
  96. char * /* to */,
  97. char * & next
  98. ) const{
  99. next = from;
  100. return ok;
  101. }
  102. virtual int do_encoding() const throw() {
  103. const int variable_byte_external_encoding=0;
  104. return variable_byte_external_encoding;
  105. }
  106. // How many char objects can I process to get <= max_limit
  107. // wchar_t objects?
  108. virtual int do_length(
  109. BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
  110. const char * from,
  111. const char * from_end,
  112. std::size_t max_limit
  113. ) const throw();
  114. // Largest possible value do_length(state,from,from_end,1) could return.
  115. virtual int do_max_length() const throw () {
  116. return 6; // largest UTF-8 encoding of a UCS-4 character
  117. }
  118. };
  #if 0 // not used - incorrect in any case
  // Robert Ramey - reuse utf8_codecvt_facet_wchar_t to make a code converter
  // from multi-byte char strings to utf8 encoding.
  // Everything down to the matching #endif is compiled out.
  struct utf8_codecvt_facet_char : utf8_codecvt_facet_wchar_t
  {
      typedef utf8_codecvt_facet_wchar_t base_class;

  public:
      // no_locale_manage is handed straight to the base class.
      explicit utf8_codecvt_facet_char(std::size_t no_locale_manage=0)
          : base_class(no_locale_manage)
      {
      }

  protected:
      // char -> char conversion, input direction.
      virtual std::codecvt_base::result do_in(
          std::mbstate_t& state,
          const char* from,
          const char* from_end,
          const char*& from_next,
          char* to,
          char* to_end,
          char*& to_next
      ) const;

      // char -> char conversion, output direction.
      virtual std::codecvt_base::result do_out(
          std::mbstate_t& state,
          const char* from,
          const char* from_end,
          const char*& from_next,
          char* to,
          char* to_end,
          char*& to_next
      ) const;

      // How many char objects can be processed to get <= max_limit
      // char objects?
      virtual int do_length(
          const std::mbstate_t&,
          const char* from,
          const char* from_end,
          std::size_t max_limit
      ) const;
  };
  #endif
  // Primary template: intentionally empty. A usable facet exists only for
  // the <Internal, External> pairs that are explicitly specialized.
  template<typename Internal, typename External>
  struct utf8_codecvt_facet
  {
  };
  161. template<>
  162. struct utf8_codecvt_facet<wchar_t, char>
  163. : public utf8_codecvt_facet_wchar_t
  164. {};
  #if 0
  // char/char specialization. Compiled out: its base class,
  // utf8_codecvt_facet_char, is itself inside an "#if 0" section.
  template<>
  struct utf8_codecvt_facet<char, char> : utf8_codecvt_facet_char
  {
  };
  #endif
  171. #endif // BOOST_UTF8_CODECVT_FACET_HPP