/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 // utf8_codecvt_facet.cpp // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). // Distributed under the Boost Software License, Version 1.0. (See accompany- // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // See http://www.boost.org/libs/iostreams for documentation. //#include // for multi-byte converson routines // Jonathan Turkanis: // - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for // BOOST_IOSTREAMS_NO_WIDE_STREAMS; // - Derived from codecvt_helper instead of codecvt. #include #include #include #ifdef BOOST_IOSTREAMS_NO_LOCALES # error "C++ locales not supported on this platform" #else #include #include #include #include "./utf8_codecvt_facet.hpp" #if BOOST_WORKAROUND(__BORLANDC__, <= 0x600) # pragma warn -sig // Conversion may lose significant digits # pragma warn -rng // Constant is out of range in comparison #endif /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 // implementation for wchar_t // Translate incoming UTF-8 into UCS-4 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in( std::mbstate_t&, const char * from, const char * from_end, const char * & from_next, wchar_t * to, wchar_t * to_end, wchar_t * & to_next ) const { // Basic algorithm: The first octet determines how many // octets total make up the UCS-4 character. The remaining // "continuing octets" all begin with "10". To convert, subtract // the amount that specifies the number of octets from the first // octet. Subtract 0x80 (1000 0000) from each continuing octet, // then mash the whole lot together. Note that each continuing // octet only uses 6 bits as unique values, so only shift by // multiples of 6 to combine. while (from != from_end && to != to_end) { // Error checking on the first octet if (invalid_leading_octet(*from)){ from_next = from; to_next = to; return std::codecvt_base::error; } // The first octet is adjusted by a value dependent upon // the number of "continuing octets" encoding the character const int cont_octet_count = get_cont_octet_count(*from); const wchar_t octet1_modifier_table[] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; // The unsigned char conversion is necessary in case char is // signed (I learned this the hard way) wchar_t ucs_result = (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; // Invariants : // 1) At the start of the loop, 'i' continuing characters have been // processed // 2) *from points to the next continuing character to be processed. int i = 0; while(i != cont_octet_count && from != from_end) { // Error checking on continuing characters if (invalid_continuing_octet(*from)) { from_next = from; to_next = to; return std::codecvt_base::error; } ucs_result *= (1 << 6); // each continuing character has an extra (10xxxxxx)b attached to // it that must be removed. ucs_result += (unsigned char)(*from++) - 0x80; ++i; } // If the buffer ends with an incomplete unicode character... if (from == from_end && i != cont_octet_count) { // rewind "from" to before the current character translation from_next = from - (i+1); to_next = to; return std::codecvt_base::partial; } *to++ = ucs_result; } from_next = from; to_next = to; // Were we done converting or did we run out of destination space? if(from == from_end) return std::codecvt_base::ok; else return std::codecvt_base::partial; } std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out( std::mbstate_t &, const wchar_t * from, const wchar_t * from_end, const wchar_t * & from_next, char * to, char * to_end, char * & to_next ) const { // RG - consider merging this table with the other one const wchar_t octet1_modifier_table[] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; while (from != from_end && to != to_end) { #define BOOST_NULL // Prevent macro expansion // Check for invalid UCS-4 character if (*from > std::numeric_limits::max BOOST_NULL ()) { from_next = from; to_next = to; return std::codecvt_base::error; } #undef BOOST_NULL int cont_octet_count = get_cont_octet_out_count(*from); // RG - comment this formula better int shift_exponent = (cont_octet_count) * 6; // Process the first character *to++ = octet1_modifier_table[cont_octet_count] + (unsigned char)(*from / (1 << shift_exponent)); // Process the continuation characters // Invariants: At the start of the loop: // 1) 'i' continuing octets have been generated // 2) '*to' points to the next location to place an octet // 3) shift_exponent is 6 more than needed for the next octet int i = 0; while (i != cont_octet_count && to != to_end) { shift_exponent -= 6; *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)); ++i; } // If we filled up the out buffer before encoding the character if(to == to_end && i != cont_octet_count) { from_next = from; to_next = to - (i+1); return std::codecvt_base::partial; } ++from; } from_next = from; to_next = to; // Were we done or did we run out of destination space if(from == from_end) return std::codecvt_base::ok; else return std::codecvt_base::partial; } // How many char objects can I process to get <= max_limit // wchar_t objects? int utf8_codecvt_facet_wchar_t::do_length( BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &, const char * from, const char * from_end, std::size_t max_limit ) const throw() { // RG - this code is confusing! I need a better way to express it. // and test cases. // Invariants: // 1) last_octet_count has the size of the last measured character // 2) char_count holds the number of characters shown to fit // within the bounds so far (no greater than max_limit) // 3) from_next points to the octet 'last_octet_count' before the // last measured character. int last_octet_count=0; std::size_t char_count = 0; const char* from_next = from; // Use "<" because the buffer may represent incomplete characters while (from_next+last_octet_count <= from_end && char_count <= max_limit) { from_next += last_octet_count; last_octet_count = (get_octet_count(*from_next)); ++char_count; } return boost::numeric_cast(from_next - from_end); } unsigned int utf8_codecvt_facet_wchar_t::get_octet_count( unsigned char lead_octet ){ // if the 0-bit (MSB) is 0, then 1 character if (lead_octet <= 0x7f) return 1; // Otherwise the count number of consecutive 1 bits starting at MSB assert(0xc0 <= lead_octet && lead_octet <= 0xfd); if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; else return 6; } namespace { template int get_cont_octet_out_count_impl(wchar_t word){ if (word < 0x80) { return 0; } if (word < 0x800) { return 1; } return 2; } // note the following code will generate on some platforms where // wchar_t is defined as UCS2. The warnings are superfluous as // the specialization is never instantitiated with such compilers. template<> int get_cont_octet_out_count_impl<4>(wchar_t word) { if (word < 0x80) { return 0; } if (word < 0x800) { return 1; } if (word < 0x10000) { return 2; } if (word < 0x200000) { return 3; } if (word < 0x4000000) { return 4; } return 5; } } // namespace anonymous // How many "continuing octets" will be needed for this word // == total octets - 1. int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count( wchar_t word ) const { return get_cont_octet_out_count_impl(word); } #if 0 // not used? /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 // implementation for char std::codecvt_base::result utf8_codecvt_facet_char::do_in( std::mbstate_t & state, const char * from, const char * from_end, const char * & from_next, char * to, char * to_end, char * & to_next ) const { while(from_next < from_end){ wchar_t w; wchar_t *wnext = & w; utf8_codecvt_facet_wchar_t::result ucs4_result; ucs4_result = base_class::do_in( state, from, from_end, from_next, wnext, wnext + 1, wnext ); if(codecvt_base::ok != ucs4_result) return ucs4_result; // if the conversion succeeds. int length = std::wctomb(to_next, w); assert(-1 != length); to_next += length; } return codecvt_base::ok; } std::codecvt_base::result utf8_codecvt_facet_char::do_out( mbstate_t & state, const char * from, const char * from_end, const char * & from_next, char * to, char * to_end, char * & to_next ) const { while(from_next < from_end){ wchar_t w; int result = std::mbtowc(&w, from_next, MB_LENGTH_MAX); assert(-1 != result); from_next += result; utf8_codecvt_facet_wchar_t::result ucs4_result; const wchar_t *wptr = & w; ucs4_result = base_class::do_out( state, wptr, wptr+1, wptr, to_next, to_end, to_next ); if(codecvt_base::ok != ucs4_result) return ucs4_result; } return codecvt_base::ok; } // How many bytes objects can I process to get <= max_limit // char objects? int utf8_codecvt_facet_char::do_length( // it seems that the standard doesn't use const so these librarires // would be in error BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER utf8_codecvt_facet_wchar_t::mbstate_t & initial_state, const char * from_next, const char * from_end, std::size_t max_limit ) const { int total_length = 0; const char *from = from_next; mbstate_t state = initial_state; while(from_next < from_end){ wchar_t w; wchar_t *wnext = & w; utf8_codecvt_facet_wchar_t::result ucs4_result; ucs4_result = base_class::do_in( state, from_next, from_end, from_next, wnext, wnext + 1, wnext ); if(codecvt_base::ok != ucs4_result) break; char carray[MB_LENGTH_MAX]; std::size_t count = wctomb(carray, w); if(count > max_limit) break; max_limit -= count; total_length = from_next - from; } return total_length; } #endif #endif //BOOST_IOSTREAMS_NO_WIDE_STREAMS