//
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
  8. #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
  9. #define BOOST_LOCALE_UTF_HPP_INCLUDED
  10. #include <boost/cstdint.hpp>
  11. namespace boost {
  12. namespace locale {
///
/// \brief Namespace that holds basic operations on UTF encoded sequences
///
/// None of the functions defined in this namespace require linking with the Boost.Locale library
///
  18. namespace utf {
  19. /// \cond INTERNAL
  20. #ifdef __GNUC__
  21. # define BOOST_LOCALE_LIKELY(x) __builtin_expect((x),1)
  22. # define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
  23. #else
  24. # define BOOST_LOCALE_LIKELY(x) (x)
  25. # define BOOST_LOCALE_UNLIKELY(x) (x)
  26. #endif
  27. /// \endcond
  28. ///
  29. /// \brief The integral type that can hold a Unicode code point
  30. ///
  31. typedef uint32_t code_point;
  32. ///
  33. /// \brief Special constant that defines illegal code point
  34. ///
  35. static const code_point illegal = 0xFFFFFFFFu;
  36. ///
  37. /// \brief Special constant that defines incomplete code point
  38. ///
  39. static const code_point incomplete = 0xFFFFFFFEu;
  40. ///
  41. /// \brief the function checks if \a v is a valid code point
  42. ///
  43. inline bool is_valid_codepoint(code_point v)
  44. {
  45. if(v>0x10FFFF)
  46. return false;
  47. if(0xD800 <=v && v<= 0xDFFF) // surragates
  48. return false;
  49. return true;
  50. }
  51. #ifdef BOOST_LOCALE_DOXYGEN
  52. ///
  53. /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
  54. ///
  55. template<typename CharType,int size=sizeof(CharType)>
  56. struct utf_traits {
  57. ///
  58. /// The type of the character
  59. ///
  60. typedef CharType char_type;
  61. ///
  62. /// Read one code point from the range [p,e) and return it.
  63. ///
  64. /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
  65. /// - If illegal sequence detected returns \ref illegal
  66. ///
  67. /// Requirements
  68. ///
  69. /// - Iterator is valid input iterator
  70. ///
  71. /// Postconditions
  72. ///
  73. /// - p points to the last consumed character
  74. ///
  75. template<typename Iterator>
  76. static code_point decode(Iterator &p,Iterator e);
  77. ///
  78. /// Maximal width of valid sequence in the code units:
  79. ///
  80. /// - UTF-8 - 4
  81. /// - UTF-16 - 2
  82. /// - UTF-32 - 1
  83. ///
  84. static const int max_width;
  85. ///
  86. /// The width of specific code point in the code units.
  87. ///
  88. /// Requirement: value is a valid Unicode code point
  89. /// Returns value in range [1..max_width]
  90. ///
  91. static int width(code_point value);
  92. ///
  93. /// Get the size of the trail part of variable length encoded sequence.
  94. ///
  95. /// Returns -1 if C is not valid lead character
  96. ///
  97. static int trail_length(char_type c);
  98. ///
  99. /// Returns true if c is trail code unit, always false for UTF-32
  100. ///
  101. static bool is_trail(char_type c);
  102. ///
  103. /// Returns true if c is lead code unit, always true of UTF-32
  104. ///
  105. static bool is_lead(char_type c);
  106. ///
  107. /// Convert valid Unicode code point \a value to the UTF sequence.
  108. ///
  109. /// Requirements:
  110. ///
  111. /// - \a value is valid code point
  112. /// - \a out is an output iterator should be able to accept at least width(value) units
  113. ///
  114. /// Returns the iterator past the last written code unit.
  115. ///
  116. template<typename Iterator>
  117. static Iterator encode(code_point value,Iterator out);
  118. ///
  119. /// Decodes valid UTF sequence that is pointed by p into code point.
  120. ///
  121. /// If the sequence is invalid or points to end the behavior is undefined
  122. ///
  123. template<typename Iterator>
  124. static code_point decode_valid(Iterator &p);
  125. };
  126. #else
  127. template<typename CharType,int size=sizeof(CharType)>
  128. struct utf_traits;
  129. template<typename CharType>
  130. struct utf_traits<CharType,1> {
  131. typedef CharType char_type;
  132. static int trail_length(char_type ci)
  133. {
  134. unsigned char c = ci;
  135. if(c < 128)
  136. return 0;
  137. if(BOOST_LOCALE_UNLIKELY(c < 194))
  138. return -1;
  139. if(c < 224)
  140. return 1;
  141. if(c < 240)
  142. return 2;
  143. if(BOOST_LOCALE_LIKELY(c <=244))
  144. return 3;
  145. return -1;
  146. }
  147. static const int max_width = 4;
  148. static int width(code_point value)
  149. {
  150. if(value <=0x7F) {
  151. return 1;
  152. }
  153. else if(value <=0x7FF) {
  154. return 2;
  155. }
  156. else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
  157. return 3;
  158. }
  159. else {
  160. return 4;
  161. }
  162. }
  163. static bool is_trail(char_type ci)
  164. {
  165. unsigned char c=ci;
  166. return (c & 0xC0)==0x80;
  167. }
  168. static bool is_lead(char_type ci)
  169. {
  170. return !is_trail(ci);
  171. }
  172. template<typename Iterator>
  173. static code_point decode(Iterator &p,Iterator e)
  174. {
  175. if(BOOST_LOCALE_UNLIKELY(p==e))
  176. return incomplete;
  177. unsigned char lead = *p++;
  178. // First byte is fully validated here
  179. int trail_size = trail_length(lead);
  180. if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
  181. return illegal;
  182. //
  183. // Ok as only ASCII may be of size = 0
  184. // also optimize for ASCII text
  185. //
  186. if(trail_size == 0)
  187. return lead;
  188. code_point c = lead & ((1<<(6-trail_size))-1);
  189. // Read the rest
  190. unsigned char tmp;
  191. switch(trail_size) {
  192. case 3:
  193. if(BOOST_LOCALE_UNLIKELY(p==e))
  194. return incomplete;
  195. tmp = *p++;
  196. if (!is_trail(tmp))
  197. return illegal;
  198. c = (c << 6) | ( tmp & 0x3F);
  199. case 2:
  200. if(BOOST_LOCALE_UNLIKELY(p==e))
  201. return incomplete;
  202. tmp = *p++;
  203. if (!is_trail(tmp))
  204. return illegal;
  205. c = (c << 6) | ( tmp & 0x3F);
  206. case 1:
  207. if(BOOST_LOCALE_UNLIKELY(p==e))
  208. return incomplete;
  209. tmp = *p++;
  210. if (!is_trail(tmp))
  211. return illegal;
  212. c = (c << 6) | ( tmp & 0x3F);
  213. }
  214. // Check code point validity: no surrogates and
  215. // valid range
  216. if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
  217. return illegal;
  218. // make sure it is the most compact representation
  219. if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
  220. return illegal;
  221. return c;
  222. }
  223. template<typename Iterator>
  224. static code_point decode_valid(Iterator &p)
  225. {
  226. unsigned char lead = *p++;
  227. if(lead < 192)
  228. return lead;
  229. int trail_size;
  230. if(lead < 224)
  231. trail_size = 1;
  232. else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
  233. trail_size = 2;
  234. else
  235. trail_size = 3;
  236. code_point c = lead & ((1<<(6-trail_size))-1);
  237. switch(trail_size) {
  238. case 3:
  239. c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
  240. case 2:
  241. c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
  242. case 1:
  243. c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
  244. }
  245. return c;
  246. }
  247. template<typename Iterator>
  248. static Iterator encode(code_point value,Iterator out)
  249. {
  250. if(value <= 0x7F) {
  251. *out++ = static_cast<char_type>(value);
  252. }
  253. else if(value <= 0x7FF) {
  254. *out++ = static_cast<char_type>((value >> 6) | 0xC0);
  255. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  256. }
  257. else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
  258. *out++ = static_cast<char_type>((value >> 12) | 0xE0);
  259. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  260. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  261. }
  262. else {
  263. *out++ = static_cast<char_type>((value >> 18) | 0xF0);
  264. *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
  265. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  266. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  267. }
  268. return out;
  269. }
  270. }; // utf8
  271. template<typename CharType>
  272. struct utf_traits<CharType,2> {
  273. typedef CharType char_type;
  274. // See RFC 2781
  275. static bool is_first_surrogate(uint16_t x)
  276. {
  277. return 0xD800 <=x && x<= 0xDBFF;
  278. }
  279. static bool is_second_surrogate(uint16_t x)
  280. {
  281. return 0xDC00 <=x && x<= 0xDFFF;
  282. }
  283. static code_point combine_surrogate(uint16_t w1,uint16_t w2)
  284. {
  285. return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
  286. }
  287. static int trail_length(char_type c)
  288. {
  289. if(is_first_surrogate(c))
  290. return 1;
  291. if(is_second_surrogate(c))
  292. return -1;
  293. return 0;
  294. }
  295. ///
  296. /// Returns true if c is trail code unit, always false for UTF-32
  297. ///
  298. static bool is_trail(char_type c)
  299. {
  300. return is_second_surrogate(c);
  301. }
  302. ///
  303. /// Returns true if c is lead code unit, always true of UTF-32
  304. ///
  305. static bool is_lead(char_type c)
  306. {
  307. return !is_second_surrogate(c);
  308. }
  309. template<typename It>
  310. static code_point decode(It &current,It last)
  311. {
  312. if(BOOST_LOCALE_UNLIKELY(current == last))
  313. return incomplete;
  314. uint16_t w1=*current++;
  315. if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
  316. return w1;
  317. }
  318. if(w1 > 0xDBFF)
  319. return illegal;
  320. if(current==last)
  321. return incomplete;
  322. uint16_t w2=*current++;
  323. if(w2 < 0xDC00 || 0xDFFF < w2)
  324. return illegal;
  325. return combine_surrogate(w1,w2);
  326. }
  327. template<typename It>
  328. static code_point decode_valid(It &current)
  329. {
  330. uint16_t w1=*current++;
  331. if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
  332. return w1;
  333. }
  334. uint16_t w2=*current++;
  335. return combine_surrogate(w1,w2);
  336. }
  337. static const int max_width = 2;
  338. static int width(code_point u)
  339. {
  340. return u>=0x10000 ? 2 : 1;
  341. }
  342. template<typename It>
  343. static It encode(code_point u,It out)
  344. {
  345. if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
  346. *out++ = static_cast<char_type>(u);
  347. }
  348. else {
  349. u -= 0x10000;
  350. *out++ = static_cast<char_type>(0xD800 | (u>>10));
  351. *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
  352. }
  353. return out;
  354. }
  355. }; // utf16;
  356. template<typename CharType>
  357. struct utf_traits<CharType,4> {
  358. typedef CharType char_type;
  359. static int trail_length(char_type c)
  360. {
  361. if(is_valid_codepoint(c))
  362. return 0;
  363. return -1;
  364. }
  365. static bool is_trail(char_type /*c*/)
  366. {
  367. return false;
  368. }
  369. static bool is_lead(char_type /*c*/)
  370. {
  371. return true;
  372. }
  373. template<typename It>
  374. static code_point decode_valid(It &current)
  375. {
  376. return *current++;
  377. }
  378. template<typename It>
  379. static code_point decode(It &current,It last)
  380. {
  381. if(BOOST_LOCALE_UNLIKELY(current == last))
  382. return boost::locale::utf::incomplete;
  383. code_point c=*current++;
  384. if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
  385. return boost::locale::utf::illegal;
  386. return c;
  387. }
  388. static const int max_width = 1;
  389. static int width(code_point /*u*/)
  390. {
  391. return 1;
  392. }
  393. template<typename It>
  394. static It encode(code_point u,It out)
  395. {
  396. *out++ = static_cast<char_type>(u);
  397. return out;
  398. }
  399. }; // utf32
  400. #endif
  401. } // utf
  402. } // locale
  403. } // boost
  404. #endif
  405. // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4