unicode_iterator.hpp 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785
  1. /*
  2. *
  3. * Copyright (c) 2004
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. /*
  12. * LOCATION: see http://www.boost.org for most recent version.
  13. * FILE unicode_iterator.hpp
  14. * VERSION see <boost/version.hpp>
  15. * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
  16. */
  17. /****************************************************************************
  18. Contents:
  19. ~~~~~~~~~
  20. 1) Read Only, Input Adapters:
  21. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  22. template <class BaseIterator, class U8Type = ::boost::uint8_t>
  23. class u32_to_u8_iterator;
  24. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
  25. template <class BaseIterator, class U32Type = ::boost::uint32_t>
  26. class u8_to_u32_iterator;
  27. Adapts sequence of UTF-8 code units to "look like" a sequence of UTF-32 code points.
  28. template <class BaseIterator, class U16Type = ::boost::uint16_t>
  29. class u32_to_u16_iterator;
  30. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
  31. template <class BaseIterator, class U32Type = ::boost::uint32_t>
  32. class u16_to_u32_iterator;
  33. Adapts sequence of UTF-16 code units to "look like" a sequence of UTF-32 code points.
  34. 2) Single pass output iterator adapters:
  35. template <class BaseIterator>
  36. class utf8_output_iterator;
  37. Accepts UTF-32 code points and forwards them on as UTF-8 code units.
  38. template <class BaseIterator>
  39. class utf16_output_iterator;
  40. Accepts UTF-32 code points and forwards them on as UTF-16 code units.
  41. ****************************************************************************/
  42. #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
  43. #define BOOST_REGEX_UNICODE_ITERATOR_HPP
  44. #include <boost/cstdint.hpp>
  45. #include <boost/assert.hpp>
  46. #include <boost/iterator/iterator_facade.hpp>
  47. #include <boost/static_assert.hpp>
  48. #include <boost/throw_exception.hpp>
  49. #include <stdexcept>
  50. #ifndef BOOST_NO_STD_LOCALE
  51. #include <sstream>
  52. #include <ios>
  53. #endif
  54. #include <limits.h> // CHAR_BIT
  55. namespace boost{
  56. namespace detail{
  57. static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
  58. static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
  59. static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
  60. inline bool is_high_surrogate(::boost::uint16_t v)
  61. {
  62. return (v & 0xFFFFFC00u) == 0xd800u;
  63. }
  64. inline bool is_low_surrogate(::boost::uint16_t v)
  65. {
  66. return (v & 0xFFFFFC00u) == 0xdc00u;
  67. }
  68. template <class T>
  69. inline bool is_surrogate(T v)
  70. {
  71. return (v & 0xFFFFF800u) == 0xd800;
  72. }
  73. inline unsigned utf8_byte_count(boost::uint8_t c)
  74. {
  75. // if the most significant bit with a zero in it is in position
  76. // 8-N then there are N bytes in this UTF-8 sequence:
  77. boost::uint8_t mask = 0x80u;
  78. unsigned result = 0;
  79. while(c & mask)
  80. {
  81. ++result;
  82. mask >>= 1;
  83. }
  84. return (result == 0) ? 1 : ((result > 4) ? 4 : result);
  85. }
  86. inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
  87. {
  88. return utf8_byte_count(c) - 1;
  89. }
  90. #ifdef BOOST_MSVC
  91. #pragma warning(push)
  92. #pragma warning(disable:4100)
  93. #endif
  94. #ifndef BOOST_NO_EXCEPTIONS
  95. BOOST_NORETURN
  96. #endif
  97. inline void invalid_utf32_code_point(::boost::uint32_t val)
  98. {
  99. #ifndef BOOST_NO_STD_LOCALE
  100. std::stringstream ss;
  101. ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
  102. std::out_of_range e(ss.str());
  103. #else
  104. std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
  105. #endif
  106. boost::throw_exception(e);
  107. }
  108. #ifdef BOOST_MSVC
  109. #pragma warning(pop)
  110. #endif
  111. } // namespace detail
  112. template <class BaseIterator, class U16Type = ::boost::uint16_t>
  113. class u32_to_u16_iterator
  114. : public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
  115. {
  116. typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
  117. #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
  118. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  119. BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
  120. BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
  121. #endif
  122. public:
  123. typename base_type::reference
  124. dereference()const
  125. {
  126. if(m_current == 2)
  127. extract_current();
  128. return m_values[m_current];
  129. }
  130. bool equal(const u32_to_u16_iterator& that)const
  131. {
  132. if(m_position == that.m_position)
  133. {
  134. // Both m_currents must be equal, or both even
  135. // this is the same as saying their sum must be even:
  136. return (m_current + that.m_current) & 1u ? false : true;
  137. }
  138. return false;
  139. }
  140. void increment()
  141. {
  142. // if we have a pending read then read now, so that we know whether
  143. // to skip a position, or move to a low-surrogate:
  144. if(m_current == 2)
  145. {
  146. // pending read:
  147. extract_current();
  148. }
  149. // move to the next surrogate position:
  150. ++m_current;
  151. // if we've reached the end skip a position:
  152. if(m_values[m_current] == 0)
  153. {
  154. m_current = 2;
  155. ++m_position;
  156. }
  157. }
  158. void decrement()
  159. {
  160. if(m_current != 1)
  161. {
  162. // decrementing an iterator always leads to a valid position:
  163. --m_position;
  164. extract_current();
  165. m_current = m_values[1] ? 1 : 0;
  166. }
  167. else
  168. {
  169. m_current = 0;
  170. }
  171. }
  172. BaseIterator base()const
  173. {
  174. return m_position;
  175. }
  176. // construct:
  177. u32_to_u16_iterator() : m_position(), m_current(0)
  178. {
  179. m_values[0] = 0;
  180. m_values[1] = 0;
  181. m_values[2] = 0;
  182. }
  183. u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
  184. {
  185. m_values[0] = 0;
  186. m_values[1] = 0;
  187. m_values[2] = 0;
  188. }
  189. private:
  190. void extract_current()const
  191. {
  192. // begin by checking for a code point out of range:
  193. ::boost::uint32_t v = *m_position;
  194. if(v >= 0x10000u)
  195. {
  196. if(v > 0x10FFFFu)
  197. detail::invalid_utf32_code_point(*m_position);
  198. // split into two surrogates:
  199. m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
  200. m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  201. m_current = 0;
  202. BOOST_ASSERT(detail::is_high_surrogate(m_values[0]));
  203. BOOST_ASSERT(detail::is_low_surrogate(m_values[1]));
  204. }
  205. else
  206. {
  207. // 16-bit code point:
  208. m_values[0] = static_cast<U16Type>(*m_position);
  209. m_values[1] = 0;
  210. m_current = 0;
  211. // value must not be a surrogate:
  212. if(detail::is_surrogate(m_values[0]))
  213. detail::invalid_utf32_code_point(*m_position);
  214. }
  215. }
  216. BaseIterator m_position;
  217. mutable U16Type m_values[3];
  218. mutable unsigned m_current;
  219. };
  220. template <class BaseIterator, class U32Type = ::boost::uint32_t>
  221. class u16_to_u32_iterator
  222. : public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
  223. {
  224. typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
  225. // special values for pending iterator reads:
  226. BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
  227. #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
  228. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  229. BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
  230. BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
  231. #endif
  232. public:
  233. typename base_type::reference
  234. dereference()const
  235. {
  236. if(m_value == pending_read)
  237. extract_current();
  238. return m_value;
  239. }
  240. bool equal(const u16_to_u32_iterator& that)const
  241. {
  242. return m_position == that.m_position;
  243. }
  244. void increment()
  245. {
  246. // skip high surrogate first if there is one:
  247. if(detail::is_high_surrogate(*m_position)) ++m_position;
  248. ++m_position;
  249. m_value = pending_read;
  250. }
  251. void decrement()
  252. {
  253. --m_position;
  254. // if we have a low surrogate then go back one more:
  255. if(detail::is_low_surrogate(*m_position))
  256. --m_position;
  257. m_value = pending_read;
  258. }
  259. BaseIterator base()const
  260. {
  261. return m_position;
  262. }
  263. // construct:
  264. u16_to_u32_iterator() : m_position()
  265. {
  266. m_value = pending_read;
  267. }
  268. u16_to_u32_iterator(BaseIterator b) : m_position(b)
  269. {
  270. m_value = pending_read;
  271. }
  272. //
  273. // Range checked version:
  274. //
  275. u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  276. {
  277. m_value = pending_read;
  278. //
  279. // The range must not start with a low surrogate, or end in a high surrogate,
  280. // otherwise we run the risk of running outside the underlying input range.
  281. // Likewise b must not be located at a low surrogate.
  282. //
  283. boost::uint16_t val;
  284. if(start != end)
  285. {
  286. if((b != start) && (b != end))
  287. {
  288. val = *b;
  289. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  290. invalid_code_point(val);
  291. }
  292. val = *start;
  293. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  294. invalid_code_point(val);
  295. val = *--end;
  296. if(detail::is_high_surrogate(val))
  297. invalid_code_point(val);
  298. }
  299. }
  300. private:
  301. static void invalid_code_point(::boost::uint16_t val)
  302. {
  303. #ifndef BOOST_NO_STD_LOCALE
  304. std::stringstream ss;
  305. ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
  306. std::out_of_range e(ss.str());
  307. #else
  308. std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
  309. #endif
  310. boost::throw_exception(e);
  311. }
  312. void extract_current()const
  313. {
  314. m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
  315. // if the last value is a high surrogate then adjust m_position and m_value as needed:
  316. if(detail::is_high_surrogate(*m_position))
  317. {
  318. // precondition; next value must have be a low-surrogate:
  319. BaseIterator next(m_position);
  320. ::boost::uint16_t t = *++next;
  321. if((t & 0xFC00u) != 0xDC00u)
  322. invalid_code_point(t);
  323. m_value = (m_value - detail::high_surrogate_base) << 10;
  324. m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
  325. }
  326. // postcondition; result must not be a surrogate:
  327. if(detail::is_surrogate(m_value))
  328. invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
  329. }
  330. BaseIterator m_position;
  331. mutable U32Type m_value;
  332. };
  333. template <class BaseIterator, class U8Type = ::boost::uint8_t>
  334. class u32_to_u8_iterator
  335. : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
  336. {
  337. typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;
  338. #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
  339. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  340. BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
  341. BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
  342. #endif
  343. public:
  344. typename base_type::reference
  345. dereference()const
  346. {
  347. if(m_current == 4)
  348. extract_current();
  349. return m_values[m_current];
  350. }
  351. bool equal(const u32_to_u8_iterator& that)const
  352. {
  353. if(m_position == that.m_position)
  354. {
  355. // either the m_current's must be equal, or one must be 0 and
  356. // the other 4: which means neither must have bits 1 or 2 set:
  357. return (m_current == that.m_current)
  358. || (((m_current | that.m_current) & 3) == 0);
  359. }
  360. return false;
  361. }
  362. void increment()
  363. {
  364. // if we have a pending read then read now, so that we know whether
  365. // to skip a position, or move to a low-surrogate:
  366. if(m_current == 4)
  367. {
  368. // pending read:
  369. extract_current();
  370. }
  371. // move to the next surrogate position:
  372. ++m_current;
  373. // if we've reached the end skip a position:
  374. if(m_values[m_current] == 0)
  375. {
  376. m_current = 4;
  377. ++m_position;
  378. }
  379. }
  380. void decrement()
  381. {
  382. if((m_current & 3) == 0)
  383. {
  384. --m_position;
  385. extract_current();
  386. m_current = 3;
  387. while(m_current && (m_values[m_current] == 0))
  388. --m_current;
  389. }
  390. else
  391. --m_current;
  392. }
  393. BaseIterator base()const
  394. {
  395. return m_position;
  396. }
  397. // construct:
  398. u32_to_u8_iterator() : m_position(), m_current(0)
  399. {
  400. m_values[0] = 0;
  401. m_values[1] = 0;
  402. m_values[2] = 0;
  403. m_values[3] = 0;
  404. m_values[4] = 0;
  405. }
  406. u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
  407. {
  408. m_values[0] = 0;
  409. m_values[1] = 0;
  410. m_values[2] = 0;
  411. m_values[3] = 0;
  412. m_values[4] = 0;
  413. }
  414. private:
  415. void extract_current()const
  416. {
  417. boost::uint32_t c = *m_position;
  418. if(c > 0x10FFFFu)
  419. detail::invalid_utf32_code_point(c);
  420. if(c < 0x80u)
  421. {
  422. m_values[0] = static_cast<unsigned char>(c);
  423. m_values[1] = static_cast<unsigned char>(0u);
  424. m_values[2] = static_cast<unsigned char>(0u);
  425. m_values[3] = static_cast<unsigned char>(0u);
  426. }
  427. else if(c < 0x800u)
  428. {
  429. m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
  430. m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  431. m_values[2] = static_cast<unsigned char>(0u);
  432. m_values[3] = static_cast<unsigned char>(0u);
  433. }
  434. else if(c < 0x10000u)
  435. {
  436. m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
  437. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  438. m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  439. m_values[3] = static_cast<unsigned char>(0u);
  440. }
  441. else
  442. {
  443. m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
  444. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  445. m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  446. m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  447. }
  448. m_current= 0;
  449. }
  450. BaseIterator m_position;
  451. mutable U8Type m_values[5];
  452. mutable unsigned m_current;
  453. };
  454. template <class BaseIterator, class U32Type = ::boost::uint32_t>
  455. class u8_to_u32_iterator
  456. : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
  457. {
  458. typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
  459. // special values for pending iterator reads:
  460. BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
  461. #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
  462. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  463. BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
  464. BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
  465. #endif
  466. public:
  467. typename base_type::reference
  468. dereference()const
  469. {
  470. if(m_value == pending_read)
  471. extract_current();
  472. return m_value;
  473. }
  474. bool equal(const u8_to_u32_iterator& that)const
  475. {
  476. return m_position == that.m_position;
  477. }
  478. void increment()
  479. {
  480. // We must not start with a continuation character:
  481. if((static_cast<boost::uint8_t>(*m_position) & 0xC0) == 0x80)
  482. invalid_sequence();
  483. // skip high surrogate first if there is one:
  484. unsigned c = detail::utf8_byte_count(*m_position);
  485. if(m_value == pending_read)
  486. {
  487. // Since we haven't read in a value, we need to validate the code points:
  488. for(unsigned i = 0; i < c; ++i)
  489. {
  490. ++m_position;
  491. // We must have a continuation byte:
  492. if((i != c - 1) && ((static_cast<boost::uint8_t>(*m_position) & 0xC0) != 0x80))
  493. invalid_sequence();
  494. }
  495. }
  496. else
  497. {
  498. std::advance(m_position, c);
  499. }
  500. m_value = pending_read;
  501. }
  502. void decrement()
  503. {
  504. // Keep backtracking until we don't have a trailing character:
  505. unsigned count = 0;
  506. while((*--m_position & 0xC0u) == 0x80u) ++count;
  507. // now check that the sequence was valid:
  508. if(count != detail::utf8_trailing_byte_count(*m_position))
  509. invalid_sequence();
  510. m_value = pending_read;
  511. }
  512. BaseIterator base()const
  513. {
  514. return m_position;
  515. }
  516. // construct:
  517. u8_to_u32_iterator() : m_position()
  518. {
  519. m_value = pending_read;
  520. }
  521. u8_to_u32_iterator(BaseIterator b) : m_position(b)
  522. {
  523. m_value = pending_read;
  524. }
  525. //
  526. // Checked constructor:
  527. //
  528. u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  529. {
  530. m_value = pending_read;
  531. //
  532. // We must not start with a continuation character, or end with a
  533. // truncated UTF-8 sequence otherwise we run the risk of going past
  534. // the start/end of the underlying sequence:
  535. //
  536. if(start != end)
  537. {
  538. unsigned char v = *start;
  539. if((v & 0xC0u) == 0x80u)
  540. invalid_sequence();
  541. if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
  542. invalid_sequence();
  543. BaseIterator pos = end;
  544. do
  545. {
  546. v = *--pos;
  547. }
  548. while((start != pos) && ((v & 0xC0u) == 0x80u));
  549. std::ptrdiff_t extra = detail::utf8_byte_count(v);
  550. if(std::distance(pos, end) < extra)
  551. invalid_sequence();
  552. }
  553. }
  554. private:
  555. static void invalid_sequence()
  556. {
  557. std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
  558. boost::throw_exception(e);
  559. }
  560. void extract_current()const
  561. {
  562. m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
  563. // we must not have a continuation character:
  564. if((m_value & 0xC0u) == 0x80u)
  565. invalid_sequence();
  566. // see how many extra bytes we have:
  567. unsigned extra = detail::utf8_trailing_byte_count(*m_position);
  568. // extract the extra bits, 6 from each extra byte:
  569. BaseIterator next(m_position);
  570. for(unsigned c = 0; c < extra; ++c)
  571. {
  572. ++next;
  573. m_value <<= 6;
  574. // We must have a continuation byte:
  575. if((static_cast<boost::uint8_t>(*next) & 0xC0) != 0x80)
  576. invalid_sequence();
  577. m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
  578. }
  579. // we now need to remove a few of the leftmost bits, but how many depends
  580. // upon how many extra bytes we've extracted:
  581. static const boost::uint32_t masks[4] =
  582. {
  583. 0x7Fu,
  584. 0x7FFu,
  585. 0xFFFFu,
  586. 0x1FFFFFu,
  587. };
  588. m_value &= masks[extra];
  589. // check the result is in range:
  590. if(m_value > static_cast<U32Type>(0x10FFFFu))
  591. invalid_sequence();
  592. // The result must not be a surrogate:
  593. if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
  594. invalid_sequence();
  595. // We should not have had an invalidly encoded UTF8 sequence:
  596. if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
  597. invalid_sequence();
  598. }
  599. BaseIterator m_position;
  600. mutable U32Type m_value;
  601. };
  602. template <class BaseIterator>
  603. class utf16_output_iterator
  604. {
  605. public:
  606. typedef void difference_type;
  607. typedef void value_type;
  608. typedef boost::uint32_t* pointer;
  609. typedef boost::uint32_t& reference;
  610. typedef std::output_iterator_tag iterator_category;
  611. utf16_output_iterator(const BaseIterator& b)
  612. : m_position(b){}
  613. utf16_output_iterator(const utf16_output_iterator& that)
  614. : m_position(that.m_position){}
  615. utf16_output_iterator& operator=(const utf16_output_iterator& that)
  616. {
  617. m_position = that.m_position;
  618. return *this;
  619. }
  620. const utf16_output_iterator& operator*()const
  621. {
  622. return *this;
  623. }
  624. void operator=(boost::uint32_t val)const
  625. {
  626. push(val);
  627. }
  628. utf16_output_iterator& operator++()
  629. {
  630. return *this;
  631. }
  632. utf16_output_iterator& operator++(int)
  633. {
  634. return *this;
  635. }
  636. BaseIterator base()const
  637. {
  638. return m_position;
  639. }
  640. private:
  641. void push(boost::uint32_t v)const
  642. {
  643. if(v >= 0x10000u)
  644. {
  645. // begin by checking for a code point out of range:
  646. if(v > 0x10FFFFu)
  647. detail::invalid_utf32_code_point(v);
  648. // split into two surrogates:
  649. *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
  650. *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  651. }
  652. else
  653. {
  654. // 16-bit code point:
  655. // value must not be a surrogate:
  656. if(detail::is_surrogate(v))
  657. detail::invalid_utf32_code_point(v);
  658. *m_position++ = static_cast<boost::uint16_t>(v);
  659. }
  660. }
  661. mutable BaseIterator m_position;
  662. };
  663. template <class BaseIterator>
  664. class utf8_output_iterator
  665. {
  666. public:
  667. typedef void difference_type;
  668. typedef void value_type;
  669. typedef boost::uint32_t* pointer;
  670. typedef boost::uint32_t& reference;
  671. typedef std::output_iterator_tag iterator_category;
  672. utf8_output_iterator(const BaseIterator& b)
  673. : m_position(b){}
  674. utf8_output_iterator(const utf8_output_iterator& that)
  675. : m_position(that.m_position){}
  676. utf8_output_iterator& operator=(const utf8_output_iterator& that)
  677. {
  678. m_position = that.m_position;
  679. return *this;
  680. }
  681. const utf8_output_iterator& operator*()const
  682. {
  683. return *this;
  684. }
  685. void operator=(boost::uint32_t val)const
  686. {
  687. push(val);
  688. }
  689. utf8_output_iterator& operator++()
  690. {
  691. return *this;
  692. }
  693. utf8_output_iterator& operator++(int)
  694. {
  695. return *this;
  696. }
  697. BaseIterator base()const
  698. {
  699. return m_position;
  700. }
  701. private:
  702. void push(boost::uint32_t c)const
  703. {
  704. if(c > 0x10FFFFu)
  705. detail::invalid_utf32_code_point(c);
  706. if(c < 0x80u)
  707. {
  708. *m_position++ = static_cast<unsigned char>(c);
  709. }
  710. else if(c < 0x800u)
  711. {
  712. *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
  713. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  714. }
  715. else if(c < 0x10000u)
  716. {
  717. *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
  718. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  719. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  720. }
  721. else
  722. {
  723. *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
  724. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  725. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  726. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  727. }
  728. }
  729. mutable BaseIterator m_position;
  730. };
  731. } // namespace boost
  732. #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP