regex_token_iterator.hpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. ///////////////////////////////////////////////////////////////////////////////
  2. /// \file regex_token_iterator.hpp
  3. /// Contains the definition of regex_token_iterator, an STL-compatible iterator
  4. /// for tokenizing a string using a regular expression.
  5. //
  6. // Copyright 2008 Eric Niebler. Distributed under the Boost
  7. // Software License, Version 1.0. (See accompanying file
  8. // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. #ifndef BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005
  10. #define BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005
  11. // MS compatible compilers support #pragma once
  12. #if defined(_MSC_VER)
  13. # pragma once
  14. #endif
  15. #include <vector>
  16. #include <boost/assert.hpp>
  17. #include <boost/mpl/assert.hpp>
  18. #include <boost/type_traits/is_same.hpp>
  19. #include <boost/type_traits/is_convertible.hpp>
  20. #include <boost/xpressive/regex_iterator.hpp>
  21. namespace boost { namespace xpressive { namespace detail
  22. {
  23. //////////////////////////////////////////////////////////////////////////
  24. // regex_token_iterator_impl
  25. //
  26. template<typename BidiIter>
  27. struct regex_token_iterator_impl
  28. : counted_base<regex_token_iterator_impl<BidiIter> >
  29. {
  30. typedef sub_match<BidiIter> value_type;
  31. regex_token_iterator_impl
  32. (
  33. BidiIter begin
  34. , BidiIter cur
  35. , BidiIter end
  36. , BidiIter next_search
  37. , basic_regex<BidiIter> const &rex
  38. , regex_constants::match_flag_type flags = regex_constants::match_default
  39. , std::vector<int> subs = std::vector<int>(1, 0)
  40. , int n = -2
  41. , bool not_null = false
  42. )
  43. : iter_(begin, cur, end, next_search, rex, flags, not_null)
  44. , result_()
  45. , n_((-2 == n) ? (int)subs.size() - 1 : n)
  46. , subs_()
  47. {
  48. BOOST_ASSERT(0 != subs.size());
  49. this->subs_.swap(subs);
  50. }
  51. bool next()
  52. {
  53. if(-1 != this->n_)
  54. {
  55. BidiIter cur = this->iter_.state_.cur_;
  56. if(0 != (++this->n_ %= (int)this->subs_.size()) || this->iter_.next())
  57. {
  58. this->result_ = (-1 == this->subs_[ this->n_ ])
  59. ? this->iter_.what_.prefix()
  60. : this->iter_.what_[ this->subs_[ this->n_ ] ];
  61. return true;
  62. }
  63. else if(-1 == this->subs_[ this->n_-- ] && cur != this->iter_.state_.end_)
  64. {
  65. this->result_ = value_type(cur, this->iter_.state_.end_, true);
  66. return true;
  67. }
  68. }
  69. return false;
  70. }
  71. bool equal_to(regex_token_iterator_impl<BidiIter> const &that) const
  72. {
  73. return this->iter_.equal_to(that.iter_) && this->n_ == that.n_;
  74. }
  75. regex_iterator_impl<BidiIter> iter_;
  76. value_type result_;
  77. int n_;
  78. std::vector<int> subs_;
  79. };
  /// Converts a sub-match designator to its integer index. A plain int is
  /// already that index, so this overload returns it untouched.
  inline int get_mark_number(int i)
  {
      int const mark_number = i;
      return mark_number;
  }
  /// Wraps a single sub-match index in a one-element vector.
  inline std::vector<int> to_vector(int subs)
  {
      std::vector<int> single;
      single.push_back(subs);
      return single;
  }
  /// A vector of int needs no conversion; the caller's vector is returned
  /// by reference, without copying.
  inline std::vector<int> const &to_vector(std::vector<int> const &subs)
  {
      std::vector<int> const &unchanged = subs;
      return unchanged;
  }
  92. template<typename Int, std::size_t Size>
  93. inline std::vector<int> to_vector(Int const (&sub_matches)[ Size ])
  94. {
  95. // so that people can specify sub-match indices inline with
  96. // string literals, like "\1\2\3", leave off the trailing '\0'
  97. std::size_t const size = Size - is_same<Int, char>::value;
  98. std::vector<int> vect(size);
  99. for(std::size_t i = 0; i < size; ++i)
  100. {
  101. vect[i] = get_mark_number(sub_matches[i]);
  102. }
  103. return vect;
  104. }
  105. template<typename Int>
  106. inline std::vector<int> to_vector(std::vector<Int> const &sub_matches)
  107. {
  108. BOOST_MPL_ASSERT((is_convertible<Int, int>));
  109. return std::vector<int>(sub_matches.begin(), sub_matches.end());
  110. }
  111. } // namespace detail
  112. //////////////////////////////////////////////////////////////////////////
  113. // regex_token_iterator
  114. //
  115. template<typename BidiIter>
  116. struct regex_token_iterator
  117. {
  118. typedef basic_regex<BidiIter> regex_type;
  119. typedef typename iterator_value<BidiIter>::type char_type;
  120. typedef sub_match<BidiIter> value_type;
  121. typedef std::ptrdiff_t difference_type;
  122. typedef value_type const *pointer;
  123. typedef value_type const &reference;
  124. typedef std::forward_iterator_tag iterator_category;
  125. /// INTERNAL ONLY
  126. typedef detail::regex_token_iterator_impl<BidiIter> impl_type_;
  127. /// \post \c *this is the end of sequence iterator.
  128. regex_token_iterator()
  129. : impl_()
  130. {
  131. }
  132. /// \param begin The beginning of the character range to search.
  133. /// \param end The end of the character range to search.
  134. /// \param rex The regex pattern to search for.
  135. /// \pre \c [begin,end) is a valid range.
  136. regex_token_iterator
  137. (
  138. BidiIter begin
  139. , BidiIter end
  140. , basic_regex<BidiIter> const &rex
  141. )
  142. : impl_()
  143. {
  144. if(0 != rex.regex_id())
  145. {
  146. this->impl_ = new impl_type_(begin, begin, end, begin, rex);
  147. this->next_();
  148. }
  149. }
  150. /// \param begin The beginning of the character range to search.
  151. /// \param end The end of the character range to search.
  152. /// \param rex The regex pattern to search for.
  153. /// \param args A let() expression with argument bindings for semantic actions.
  154. /// \pre \c [begin,end) is a valid range.
  155. template<typename LetExpr>
  156. regex_token_iterator
  157. (
  158. BidiIter begin
  159. , BidiIter end
  160. , basic_regex<BidiIter> const &rex
  161. , detail::let_<LetExpr> const &args
  162. )
  163. : impl_()
  164. {
  165. if(0 != rex.regex_id())
  166. {
  167. this->impl_ = new impl_type_(begin, begin, end, begin, rex);
  168. detail::bind_args(args, this->impl_->iter_.what_);
  169. this->next_();
  170. }
  171. }
  172. /// \param begin The beginning of the character range to search.
  173. /// \param end The end of the character range to search.
  174. /// \param rex The regex pattern to search for.
  175. /// \param subs A range of integers designating sub-matches to be treated as tokens.
  176. /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.)
  177. /// \pre \c [begin,end) is a valid range.
  178. /// \pre \c subs is either an integer greater or equal to -1,
  179. /// or else an array or non-empty \c std::vector\<\> of such integers.
  180. template<typename Subs>
  181. regex_token_iterator
  182. (
  183. BidiIter begin
  184. , BidiIter end
  185. , basic_regex<BidiIter> const &rex
  186. , Subs const &subs
  187. , regex_constants::match_flag_type flags = regex_constants::match_default
  188. )
  189. : impl_()
  190. {
  191. if(0 != rex.regex_id())
  192. {
  193. this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs));
  194. this->next_();
  195. }
  196. }
  197. /// \param begin The beginning of the character range to search.
  198. /// \param end The end of the character range to search.
  199. /// \param rex The regex pattern to search for.
  200. /// \param subs A range of integers designating sub-matches to be treated as tokens.
  201. /// \param args A let() expression with argument bindings for semantic actions.
  202. /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.)
  203. /// \pre \c [begin,end) is a valid range.
  204. /// \pre \c subs is either an integer greater or equal to -1,
  205. /// or else an array or non-empty \c std::vector\<\> of such integers.
  206. template<typename Subs, typename LetExpr>
  207. regex_token_iterator
  208. (
  209. BidiIter begin
  210. , BidiIter end
  211. , basic_regex<BidiIter> const &rex
  212. , Subs const &subs
  213. , detail::let_<LetExpr> const &args
  214. , regex_constants::match_flag_type flags = regex_constants::match_default
  215. )
  216. : impl_()
  217. {
  218. if(0 != rex.regex_id())
  219. {
  220. this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs));
  221. detail::bind_args(args, this->impl_->iter_.what_);
  222. this->next_();
  223. }
  224. }
  225. /// \post <tt>*this == that</tt>
  226. regex_token_iterator(regex_token_iterator<BidiIter> const &that)
  227. : impl_(that.impl_) // COW
  228. {
  229. }
  230. /// \post <tt>*this == that</tt>
  231. regex_token_iterator<BidiIter> &operator =(regex_token_iterator<BidiIter> const &that)
  232. {
  233. this->impl_ = that.impl_; // COW
  234. return *this;
  235. }
  236. friend bool operator ==(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right)
  237. {
  238. if(!left.impl_ || !right.impl_)
  239. {
  240. return !left.impl_ && !right.impl_;
  241. }
  242. return left.impl_->equal_to(*right.impl_);
  243. }
  244. friend bool operator !=(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right)
  245. {
  246. return !(left == right);
  247. }
  248. value_type const &operator *() const
  249. {
  250. return this->impl_->result_;
  251. }
  252. value_type const *operator ->() const
  253. {
  254. return &this->impl_->result_;
  255. }
  256. /// If N == -1 then sets *this equal to the end of sequence iterator.
  257. /// Otherwise if N+1 \< subs.size(), then increments N and sets result equal to
  258. /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())).
  259. /// Otherwise if what.prefix().first != what[0].second and if the element match_prev_avail is
  260. /// not set in flags then sets it. Then locates the next match as if by calling
  261. /// regex_search(what[0].second, end, what, *pre, flags), with the following variation:
  262. /// in the event that the previous match found was of zero length (what[0].length() == 0)
  263. /// then attempts to find a non-zero length match starting at what[0].second, only if that
  264. /// fails and provided what[0].second != suffix().second does it look for a (possibly zero
  265. /// length) match starting from what[0].second + 1. If such a match is found then sets N
  266. /// equal to zero, and sets result equal to
  267. /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())).
  268. /// Otherwise if no further matches were found, then let last_end be the endpoint of the last
  269. /// match that was found. Then if last_end != end and subs[0] == -1 sets N equal to -1 and
  270. /// sets result equal to value_type(last_end, end). Otherwise sets *this equal to the end
  271. /// of sequence iterator.
  272. regex_token_iterator<BidiIter> &operator ++()
  273. {
  274. this->fork_(); // un-share the implementation
  275. this->next_();
  276. return *this;
  277. }
  278. regex_token_iterator<BidiIter> operator ++(int)
  279. {
  280. regex_token_iterator<BidiIter> tmp(*this);
  281. ++*this;
  282. return tmp;
  283. }
  284. private:
  285. /// INTERNAL ONLY
  286. void fork_()
  287. {
  288. if(1 != this->impl_->use_count())
  289. {
  290. intrusive_ptr<impl_type_> clone = new impl_type_
  291. (
  292. this->impl_->iter_.state_.begin_
  293. , this->impl_->iter_.state_.cur_
  294. , this->impl_->iter_.state_.end_
  295. , this->impl_->iter_.state_.next_search_
  296. , this->impl_->iter_.rex_
  297. , this->impl_->iter_.flags_
  298. , this->impl_->subs_
  299. , this->impl_->n_
  300. , this->impl_->iter_.not_null_
  301. );
  302. // only copy the match_results struct if we have to. Note: if the next call
  303. // to impl_->next() will return false or call regex_search, we don't need to
  304. // copy the match_results struct.
  305. if(-1 != this->impl_->n_ && this->impl_->n_ + 1 != static_cast<int>(this->impl_->subs_.size()))
  306. {
  307. // BUGBUG This is expensive -- it causes the sequence_stack to be cleared.
  308. // Find a better way
  309. clone->iter_.what_ = this->impl_->iter_.what_;
  310. }
  311. else
  312. {
  313. // At the very least, copy the action args
  314. detail::core_access<BidiIter>::get_action_args(clone->iter_.what_)
  315. = detail::core_access<BidiIter>::get_action_args(this->impl_->iter_.what_);
  316. }
  317. this->impl_.swap(clone);
  318. }
  319. }
  320. /// INTERNAL ONLY
  321. void next_()
  322. {
  323. BOOST_ASSERT(this->impl_ && 1 == this->impl_->use_count());
  324. if(!this->impl_->next())
  325. {
  326. this->impl_ = 0;
  327. }
  328. }
  329. intrusive_ptr<impl_type_> impl_;
  330. };
  331. }} // namespace boost::xpressive
  332. #endif