123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460 |
- //
- // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
- //
- // Distributed under the Boost Software License, Version 1.0. (See
- // accompanying file LICENSE_1_0.txt or copy at
- // http://www.boost.org/LICENSE_1_0.txt)
- //
- #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
- #define BOOST_LOCALE_UTF_HPP_INCLUDED
- #include <boost/cstdint.hpp>
- namespace boost {
- namespace locale {
- ///
- /// \brief Namespace that holds basic operations on UTF encoded sequences
- ///
- /// All functions defined in this namespace do not require linking with Boost.Locale library
- ///
- namespace utf {
/// \cond INTERNAL
//
// Branch-prediction hints used by the decoders below.
//
// With GCC-compatible compilers they expand to __builtin_expect so the
// annotated condition is treated as (un)likely; with every other compiler
// they expand to the bare expression, so the value of the condition is the
// same either way.
//
#ifdef __GNUC__
#   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
#   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
#else
#   define BOOST_LOCALE_LIKELY(x)   (x)
#   define BOOST_LOCALE_UNLIKELY(x) (x)
#endif
/// \endcond
///
/// \brief The integral type that can hold a Unicode code point
///
typedef uint32_t code_point;

///
/// \brief Special constant that defines illegal code point
///
/// It is returned by the decoding functions when an invalid sequence is met;
/// the value itself is outside of the Unicode range.
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Special constant that defines incomplete code point
///
/// It is returned by the decoding functions when the input range ends before
/// a complete sequence was read; the value itself is outside of the Unicode range.
///
static const code_point incomplete = 0xFFFFFFFEu;
- ///
- /// \brief the function checks if \a v is a valid code point
- ///
- inline bool is_valid_codepoint(code_point v)
- {
- if(v>0x10FFFF)
- return false;
- if(0xD800 <=v && v<= 0xDFFF) // surragates
- return false;
- return true;
- }
- #ifdef BOOST_LOCALE_DOXYGEN
- ///
- /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
- ///
- template<typename CharType,int size=sizeof(CharType)>
- struct utf_traits {
- ///
- /// The type of the character
- ///
- typedef CharType char_type;
- ///
- /// Read one code point from the range [p,e) and return it.
- ///
- /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
- /// - If illegal sequence detected returns \ref illegal
- ///
- /// Requirements
- ///
- /// - Iterator is valid input iterator
- ///
- /// Postconditions
- ///
- /// - p points to the last consumed character
- ///
- template<typename Iterator>
- static code_point decode(Iterator &p,Iterator e);
- ///
- /// Maximal width of valid sequence in the code units:
- ///
- /// - UTF-8 - 4
- /// - UTF-16 - 2
- /// - UTF-32 - 1
- ///
- static const int max_width;
- ///
- /// The width of specific code point in the code units.
- ///
- /// Requirement: value is a valid Unicode code point
- /// Returns value in range [1..max_width]
- ///
- static int width(code_point value);
- ///
- /// Get the size of the trail part of variable length encoded sequence.
- ///
- /// Returns -1 if C is not valid lead character
- ///
- static int trail_length(char_type c);
- ///
- /// Returns true if c is trail code unit, always false for UTF-32
- ///
- static bool is_trail(char_type c);
- ///
- /// Returns true if c is lead code unit, always true of UTF-32
- ///
- static bool is_lead(char_type c);
- ///
- /// Convert valid Unicode code point \a value to the UTF sequence.
- ///
- /// Requirements:
- ///
- /// - \a value is valid code point
- /// - \a out is an output iterator should be able to accept at least width(value) units
- ///
- /// Returns the iterator past the last written code unit.
- ///
- template<typename Iterator>
- static Iterator encode(code_point value,Iterator out);
- ///
- /// Decodes valid UTF sequence that is pointed by p into code point.
- ///
- /// If the sequence is invalid or points to end the behavior is undefined
- ///
- template<typename Iterator>
- static code_point decode_valid(Iterator &p);
- };
-
- #else
// Primary template: declared only. The implementation is selected by the
// size of the character type through the partial specializations for
// 1 (UTF-8), 2 (UTF-16) and 4 (UTF-32) byte wide characters.
template<typename CharType,int size=sizeof(CharType)>
struct utf_traits;
- template<typename CharType>
- struct utf_traits<CharType,1> {
- typedef CharType char_type;
-
- static int trail_length(char_type ci)
- {
- unsigned char c = ci;
- if(c < 128)
- return 0;
- if(BOOST_LOCALE_UNLIKELY(c < 194))
- return -1;
- if(c < 224)
- return 1;
- if(c < 240)
- return 2;
- if(BOOST_LOCALE_LIKELY(c <=244))
- return 3;
- return -1;
- }
-
- static const int max_width = 4;
- static int width(code_point value)
- {
- if(value <=0x7F) {
- return 1;
- }
- else if(value <=0x7FF) {
- return 2;
- }
- else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
- return 3;
- }
- else {
- return 4;
- }
- }
- static bool is_trail(char_type ci)
- {
- unsigned char c=ci;
- return (c & 0xC0)==0x80;
- }
- static bool is_lead(char_type ci)
- {
- return !is_trail(ci);
- }
-
- template<typename Iterator>
- static code_point decode(Iterator &p,Iterator e)
- {
- if(BOOST_LOCALE_UNLIKELY(p==e))
- return incomplete;
- unsigned char lead = *p++;
- // First byte is fully validated here
- int trail_size = trail_length(lead);
- if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
- return illegal;
- //
- // Ok as only ASCII may be of size = 0
- // also optimize for ASCII text
- //
- if(trail_size == 0)
- return lead;
-
- code_point c = lead & ((1<<(6-trail_size))-1);
- // Read the rest
- unsigned char tmp;
- switch(trail_size) {
- case 3:
- if(BOOST_LOCALE_UNLIKELY(p==e))
- return incomplete;
- tmp = *p++;
- if (!is_trail(tmp))
- return illegal;
- c = (c << 6) | ( tmp & 0x3F);
- case 2:
- if(BOOST_LOCALE_UNLIKELY(p==e))
- return incomplete;
- tmp = *p++;
- if (!is_trail(tmp))
- return illegal;
- c = (c << 6) | ( tmp & 0x3F);
- case 1:
- if(BOOST_LOCALE_UNLIKELY(p==e))
- return incomplete;
- tmp = *p++;
- if (!is_trail(tmp))
- return illegal;
- c = (c << 6) | ( tmp & 0x3F);
- }
- // Check code point validity: no surrogates and
- // valid range
- if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
- return illegal;
- // make sure it is the most compact representation
- if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
- return illegal;
- return c;
- }
-
- template<typename Iterator>
- static code_point decode_valid(Iterator &p)
- {
- unsigned char lead = *p++;
- if(lead < 192)
- return lead;
- int trail_size;
- if(lead < 224)
- trail_size = 1;
- else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
- trail_size = 2;
- else
- trail_size = 3;
-
- code_point c = lead & ((1<<(6-trail_size))-1);
- switch(trail_size) {
- case 3:
- c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
- case 2:
- c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
- case 1:
- c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
- }
- return c;
- }
- template<typename Iterator>
- static Iterator encode(code_point value,Iterator out)
- {
- if(value <= 0x7F) {
- *out++ = static_cast<char_type>(value);
- }
- else if(value <= 0x7FF) {
- *out++ = static_cast<char_type>((value >> 6) | 0xC0);
- *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
- }
- else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
- *out++ = static_cast<char_type>((value >> 12) | 0xE0);
- *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
- *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
- }
- else {
- *out++ = static_cast<char_type>((value >> 18) | 0xF0);
- *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
- *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
- *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
- }
- return out;
- }
- }; // utf8
- template<typename CharType>
- struct utf_traits<CharType,2> {
- typedef CharType char_type;
- // See RFC 2781
- static bool is_first_surrogate(uint16_t x)
- {
- return 0xD800 <=x && x<= 0xDBFF;
- }
- static bool is_second_surrogate(uint16_t x)
- {
- return 0xDC00 <=x && x<= 0xDFFF;
- }
- static code_point combine_surrogate(uint16_t w1,uint16_t w2)
- {
- return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
- }
- static int trail_length(char_type c)
- {
- if(is_first_surrogate(c))
- return 1;
- if(is_second_surrogate(c))
- return -1;
- return 0;
- }
- ///
- /// Returns true if c is trail code unit, always false for UTF-32
- ///
- static bool is_trail(char_type c)
- {
- return is_second_surrogate(c);
- }
- ///
- /// Returns true if c is lead code unit, always true of UTF-32
- ///
- static bool is_lead(char_type c)
- {
- return !is_second_surrogate(c);
- }
- template<typename It>
- static code_point decode(It ¤t,It last)
- {
- if(BOOST_LOCALE_UNLIKELY(current == last))
- return incomplete;
- uint16_t w1=*current++;
- if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
- return w1;
- }
- if(w1 > 0xDBFF)
- return illegal;
- if(current==last)
- return incomplete;
- uint16_t w2=*current++;
- if(w2 < 0xDC00 || 0xDFFF < w2)
- return illegal;
- return combine_surrogate(w1,w2);
- }
- template<typename It>
- static code_point decode_valid(It ¤t)
- {
- uint16_t w1=*current++;
- if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
- return w1;
- }
- uint16_t w2=*current++;
- return combine_surrogate(w1,w2);
- }
- static const int max_width = 2;
- static int width(code_point u)
- {
- return u>=0x10000 ? 2 : 1;
- }
- template<typename It>
- static It encode(code_point u,It out)
- {
- if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
- *out++ = static_cast<char_type>(u);
- }
- else {
- u -= 0x10000;
- *out++ = static_cast<char_type>(0xD800 | (u>>10));
- *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
- }
- return out;
- }
- }; // utf16;
-
- template<typename CharType>
- struct utf_traits<CharType,4> {
- typedef CharType char_type;
- static int trail_length(char_type c)
- {
- if(is_valid_codepoint(c))
- return 0;
- return -1;
- }
- static bool is_trail(char_type /*c*/)
- {
- return false;
- }
- static bool is_lead(char_type /*c*/)
- {
- return true;
- }
- template<typename It>
- static code_point decode_valid(It ¤t)
- {
- return *current++;
- }
- template<typename It>
- static code_point decode(It ¤t,It last)
- {
- if(BOOST_LOCALE_UNLIKELY(current == last))
- return boost::locale::utf::incomplete;
- code_point c=*current++;
- if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
- return boost::locale::utf::illegal;
- return c;
- }
- static const int max_width = 1;
- static int width(code_point /*u*/)
- {
- return 1;
- }
- template<typename It>
- static It encode(code_point u,It out)
- {
- *out++ = static_cast<char_type>(u);
- return out;
- }
- }; // utf32
- #endif
- } // utf
- } // locale
- } // boost
- #endif
- // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
|