regex_token_iterator.qbk 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. [/
  2. Copyright 2006-2007 John Maddock.
  3. Distributed under the Boost Software License, Version 1.0.
  4. (See accompanying file LICENSE_1_0.txt or copy at
  5. http://www.boost.org/LICENSE_1_0.txt).
  6. ]
  7. [section:regex_token_iterator regex_token_iterator]
  8. The template class [regex_token_iterator] is an iterator adapter; that is to
  9. say it represents a new view of an existing iterator sequence,
  10. by enumerating all the occurrences of a regular expression within that
  11. sequence, and presenting one or more character sequences for each match found.
  12. Each position enumerated by the iterator is a [sub_match] object that represents
  13. what matched a particular sub-expression within the regular expression.
  14. When class [regex_token_iterator] is used to enumerate a single sub-expression
  15. with index -1, then the iterator performs field splitting: that is
  16. to say it enumerates one character sequence for each section of the character
  17. container sequence that does not match the regular expression specified.
  18. template <class BidirectionalIterator,
  19. class charT = iterator_traits<BidirectionalIterator>::value_type,
  20. class traits = regex_traits<charT> >
  21. class regex_token_iterator
  22. {
  23. public:
  24. typedef basic_regex<charT, traits> regex_type;
  25. typedef sub_match<BidirectionalIterator> value_type;
  26. typedef typename iterator_traits<BidirectionalIterator>::difference_type difference_type;
  27. typedef const value_type* pointer;
  28. typedef const value_type& reference;
  29. typedef std::forward_iterator_tag iterator_category;
  30. ``[link boost_regex.regex_token_iterator.construct1 regex_token_iterator]``();
  31. ``[link boost_regex.regex_token_iterator.construct2 regex_token_iterator]``(BidirectionalIterator a,
  32. BidirectionalIterator b,
  33. const regex_type& re,
  34. int submatch = 0,
  35. match_flag_type m = match_default);
  36. ``[link boost_regex.regex_token_iterator.construct3 regex_token_iterator]``(BidirectionalIterator a,
  37. BidirectionalIterator b,
  38. const regex_type& re,
  39. const std::vector<int>& submatches,
  40. match_flag_type m = match_default);
  41. template <std::size_t N>
  42. ``[link boost_regex.regex_token_iterator.construct4 regex_token_iterator]``(BidirectionalIterator a,
  43. BidirectionalIterator b,
  44. const regex_type& re,
  45. const int (&submatches)[N],
  46. match_flag_type m = match_default);
  47. ``[link boost_regex.regex_token_iterator.construct5 regex_token_iterator]``(const regex_token_iterator&);
  48. regex_token_iterator& ``[link boost_regex.regex_token_iterator.assign operator=]``(const regex_token_iterator&);
  49. bool ``[link boost_regex.regex_token_iterator.op_eq operator==]``(const regex_token_iterator&)const;
  50. bool ``[link boost_regex.regex_token_iterator.op_ne operator!=]``(const regex_token_iterator&)const;
  51. const value_type& ``[link boost_regex.regex_token_iterator.op_deref operator*]``()const;
  52. const value_type* ``[link boost_regex.regex_token_iterator.op_arrow operator->]``()const;
  53. regex_token_iterator& ``[link boost_regex.regex_token_iterator.op_inc1 operator++]``();
  54. regex_token_iterator ``[link boost_regex.regex_token_iterator.op_inc2 operator++]``(int);
  55. };
  56. typedef regex_token_iterator<const char*> cregex_token_iterator;
  57. typedef regex_token_iterator<std::string::const_iterator> sregex_token_iterator;
  58. #ifndef BOOST_NO_WREGEX
  59. typedef regex_token_iterator<const wchar_t*> wcregex_token_iterator;
  60. typedef regex_token_iterator<std::wstring::const_iterator> wsregex_token_iterator;
  61. #endif
  62. template <class charT, class traits>
  63. regex_token_iterator<const charT*, charT, traits>
  64. ``[link boost_regex.regex_token_iterator.make make_regex_token_iterator]``(
  65. const charT* p,
  66. const basic_regex<charT, traits>& e,
  67. int submatch = 0,
  68. regex_constants::match_flag_type m = regex_constants::match_default);
  69. template <class charT, class traits, class ST, class SA>
  70. regex_token_iterator<typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits>
  71. ``[link boost_regex.regex_token_iterator.make make_regex_token_iterator]``(
  72. const std::basic_string<charT, ST, SA>& p,
  73. const basic_regex<charT, traits>& e,
  74. int submatch = 0,
  75. regex_constants::match_flag_type m = regex_constants::match_default);
  76. template <class charT, class traits, std::size_t N>
  77. regex_token_iterator<const charT*, charT, traits>
  78. ``[link boost_regex.regex_token_iterator.make make_regex_token_iterator]``(
  79. const charT* p,
  80. const basic_regex<charT, traits>& e,
  81. const int (&submatch)[N],
  82. regex_constants::match_flag_type m = regex_constants::match_default);
  83. template <class charT, class traits, class ST, class SA, std::size_t N>
  84. regex_token_iterator<typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits>
  85. ``[link boost_regex.regex_token_iterator.make make_regex_token_iterator]``(
  86. const std::basic_string<charT, ST, SA>& p,
  87. const basic_regex<charT, traits>& e,
  88. const int (&submatch)[N],
  89. regex_constants::match_flag_type m = regex_constants::match_default);
  90. template <class charT, class traits>
  91. regex_token_iterator<const charT*, charT, traits>
  92. ``[link boost_regex.regex_token_iterator.make make_regex_token_iterator]``(
  93. const charT* p,
  94. const basic_regex<charT, traits>& e,
  95. const std::vector<int>& submatch,
  96. regex_constants::match_flag_type m = regex_constants::match_default);
  97. template <class charT, class traits, class ST, class SA>
  98. regex_token_iterator<
  99. typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits>
  100. ``[link boost_regex.regex_token_iterator.make make_regex_token_iterator]``(
  101. const std::basic_string<charT, ST, SA>& p,
  102. const basic_regex<charT, traits>& e,
  103. const std::vector<int>& submatch,
  104. regex_constants::match_flag_type m = regex_constants::match_default);
  105. [h4 Description]
  106. [#boost_regex.regex_token_iterator.construct1]
  107. regex_token_iterator();
  108. [*Effects]: constructs an end of sequence iterator.
  109. [#boost_regex.regex_token_iterator.construct2]
  110. regex_token_iterator(BidirectionalIterator a,
  111. BidirectionalIterator b,
  112. const regex_type& re,
  113. int submatch = 0,
  114. match_flag_type m = match_default);
  115. [*Preconditions]: `!re.empty()`. Object /re/ shall exist for the lifetime of
  116. the iterator constructed from it.
  117. [*Effects]: constructs a [regex_token_iterator] that will enumerate one string for
  118. each regular expression match of the expression /re/ found within the sequence \[a,b),
  119. using match flags /m/ (see [match_flag_type]). The string enumerated is the sub-expression /submatch/
  120. for each match found; if /submatch/ is -1, then enumerates all the text
  121. sequences that did not match the expression /re/ (that is to say, it performs field
  122. splitting).
  123. [*Throws]: `std::runtime_error` if the complexity of matching the expression against
  124. an N character string begins to exceed O(N[super 2]), or if the program runs
  125. out of stack space while matching the expression (if Boost.Regex is configured
  126. in recursive mode), or if the matcher exhausts its permitted memory
  127. allocation (if Boost.Regex is configured in non-recursive mode).
  128. [#boost_regex.regex_token_iterator.construct3]
  129. regex_token_iterator(BidirectionalIterator a,
  130. BidirectionalIterator b,
  131. const regex_type& re,
  132. const std::vector<int>& submatches,
  133. match_flag_type m = match_default);
  134. [*Preconditions]: `submatches.size() && !re.empty()`. Object /re/ shall
  135. exist for the lifetime of the iterator constructed from it.
  136. [*Effects]: constructs a [regex_token_iterator] that will enumerate
  137. `submatches.size()` strings for each regular expression match of
  138. the expression /re/ found within the sequence \[a,b), using match flags /m/
  139. (see [match_flag_type]). For each match found one string will be enumerated
  140. for each sub-expression index contained within the /submatches/ vector; if
  141. `submatches[0]` is -1, then the first string enumerated for each match will be
  142. all of the text from the end of the last match to the start of the current match;
  143. in addition, there will be one extra string enumerated when no more matches can
  144. be found: from the end of the last match found to the end of the underlying sequence.
  145. [*Throws]: `std::runtime_error` if the complexity of matching the expression
  146. against an N character string begins to exceed O(N[super 2]), or if the
  147. program runs out of stack space while matching the expression (if Boost.Regex is
  148. configured in recursive mode), or if the matcher exhausts its permitted memory
  149. allocation (if Boost.Regex is configured in non-recursive mode).
  150. [#boost_regex.regex_token_iterator.construct4]
  151. template <std::size_t R>
  152. regex_token_iterator(BidirectionalIterator a,
  153. BidirectionalIterator b,
  154. const regex_type& re,
  155. const int (&submatches)[R],
  156. match_flag_type m = match_default);
  157. [*Preconditions]: `!re.empty()`. Object /re/ shall exist for the lifetime of the iterator constructed from it.
  158. [*Effects]: constructs a [regex_token_iterator] that will enumerate /R/ strings
  159. for each regular expression match of the expression /re/ found within the sequence
  160. \[a,b), using match flags /m/ (see [match_flag_type]). For each match found one
  161. string will be enumerated for each sub-expression index contained within the
  162. /submatches/ array; if `submatches[0]` is -1, then the first string enumerated for
  163. each match will be all of the text from the end of the last match to the start
  164. of the current match; in addition, there will be one extra string enumerated when
  165. no more matches can be found: from the end of the last match found to
  166. the end of the underlying sequence.
  167. [*Throws]: `std::runtime_error` if the complexity of matching the expression
  168. against an N character string begins to exceed O(N[super 2]), or if the
  169. program runs out of stack space while matching the expression (if Boost.Regex
  170. is configured in recursive mode), or if the matcher exhausts its
  171. permitted memory allocation (if Boost.Regex is configured in non-recursive mode).
  172. [#boost_regex.regex_token_iterator.construct5]
  173. regex_token_iterator(const regex_token_iterator& that);
  174. [*Effects]: constructs a copy of `that`.
  175. [*Postconditions]: `*this == that`.
  176. [#boost_regex.regex_token_iterator.assign]
  177. regex_token_iterator& operator=(const regex_token_iterator& that);
  178. [*Effects]: sets `*this` to be equal to `that`.
  179. [*Postconditions]: `*this == that`.
  180. [#boost_regex.regex_token_iterator.op_eq]
  181. bool operator==(const regex_token_iterator& that)const;
  182. [*Effects]: returns true if `*this` is at the same position as `that`.
  183. [#boost_regex.regex_token_iterator.op_ne]
  184. bool operator!=(const regex_token_iterator& that)const;
  185. [*Effects]: returns `!(*this == that)`.
  186. [#boost_regex.regex_token_iterator.op_deref]
  187. const value_type& operator*()const;
  188. [*Effects]: returns the current character sequence being enumerated.
  189. [#boost_regex.regex_token_iterator.op_arrow]
  190. const value_type* operator->()const;
  191. [*Effects]: returns `&(**this)`.
  192. [#boost_regex.regex_token_iterator.op_inc1]
  193. regex_token_iterator& operator++();
  194. [*Effects]: Moves on to the next character sequence to be enumerated.
  195. [*Throws]: `std::runtime_error` if the complexity of matching the expression
  196. against an N character string begins to exceed O(N[super 2]), or if the program
  197. runs out of stack space while matching the expression (if Boost.Regex is
  198. configured in recursive mode), or if the matcher exhausts its permitted
  199. memory allocation (if Boost.Regex is configured in non-recursive mode).
  200. [*Returns]: `*this`.
  201. [#boost_regex.regex_token_iterator.op_inc2]
  202. regex_token_iterator operator++(int);
  203. [*Effects]: constructs a copy result of `*this`, then calls `++(*this)`.
  204. [*Returns]: result.
  205. [#boost_regex.regex_token_iterator.make]
  206. template <class charT, class traits>
  207. regex_token_iterator<const charT*, charT, traits>
  208. make_regex_token_iterator(
  209. const charT* p,
  210. const basic_regex<charT, traits>& e,
  211. int submatch = 0,
  212. regex_constants::match_flag_type m = regex_constants::match_default);
  213. template <class charT, class traits, class ST, class SA>
  214. regex_token_iterator<typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits>
  215. make_regex_token_iterator(
  216. const std::basic_string<charT, ST, SA>& p,
  217. const basic_regex<charT, traits>& e,
  218. int submatch = 0,
  219. regex_constants::match_flag_type m = regex_constants::match_default);
  220. template <class charT, class traits, std::size_t N>
  221. regex_token_iterator<const charT*, charT, traits>
  222. make_regex_token_iterator(
  223. const charT* p,
  224. const basic_regex<charT, traits>& e,
  225. const int (&submatch)[N],
  226. regex_constants::match_flag_type m = regex_constants::match_default);
  227. template <class charT, class traits, class ST, class SA, std::size_t N>
  228. regex_token_iterator<
  229. typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits>
  230. make_regex_token_iterator(
  231. const std::basic_string<charT, ST, SA>& p,
  232. const basic_regex<charT, traits>& e,
  233. const int (&submatch)[N],
  234. regex_constants::match_flag_type m = regex_constants::match_default);
  235. template <class charT, class traits>
  236. regex_token_iterator<const charT*, charT, traits>
  237. make_regex_token_iterator(
  238. const charT* p,
  239. const basic_regex<charT, traits>& e,
  240. const std::vector<int>& submatch,
  241. regex_constants::match_flag_type m = regex_constants::match_default);
  242. template <class charT, class traits, class ST, class SA>
  243. regex_token_iterator<
  244. typename std::basic_string<charT, ST, SA>::const_iterator, charT, traits>
  245. make_regex_token_iterator(
  246. const std::basic_string<charT, ST, SA>& p,
  247. const basic_regex<charT, traits>& e,
  248. const std::vector<int>& submatch,
  249. regex_constants::match_flag_type m = regex_constants::match_default);
  250. [*Effects]: returns a [regex_token_iterator] that enumerates one [sub_match]
  251. for each value in /submatch/ for each occurrence of regular expression /e/
  252. in string /p/, matched using [match_flag_type] /m/.
  253. [h4 Examples]
  254. The following example takes a string and splits it into a series of tokens:
  255. #include <iostream>
  256. #include <boost/regex.hpp>
  257. using namespace std;
  258. int main(int argc, char**)
  259. {
  260. string s;
  261. do{
  262. if(argc == 1)
  263. {
  264. cout << "Enter text to split (or \"quit\" to exit): ";
  265. getline(cin, s);
  266. if(s == "quit") break;
  267. }
  268. else
  269. s = "This is a string of tokens";
  270. boost::regex re("\\s+");
  271. boost::sregex_token_iterator i(s.begin(), s.end(), re, -1);
  272. boost::sregex_token_iterator j;
  273. unsigned count = 0;
  274. while(i != j)
  275. {
  276. cout << *i++ << endl;
  277. count++;
  278. }
  279. cout << "There were " << count << " tokens found." << endl;
  280. }while(argc == 1);
  281. return 0;
  282. }
  283. The following example takes an HTML file and outputs a list of all the linked files:
  284. #include <fstream>
  285. #include <iostream>
  286. #include <iterator>
  287. #include <boost/regex.hpp>
  288. boost::regex e("<\\s*A\\s+[^>]*href\\s*=\\s*\"([^\"]*)\"",
  289. boost::regex::normal | boost::regbase::icase);
  290. void load_file(std::string& s, std::istream& is)
  291. {
  292. s.erase();
  293. //
  294. // attempt to grow string buffer to match file size,
  295. // this doesn't always work...
  296. s.reserve(is.rdbuf()->in_avail());
  297. char c;
  298. while(is.get(c))
  299. {
  300. // use logarithmic growth strategy, in case
  301. // in_avail (above) returned zero:
  302. if(s.capacity() == s.size())
  303. s.reserve(s.capacity() * 3);
  304. s.append(1, c);
  305. }
  306. }
  307. int main(int argc, char** argv)
  308. {
  309. std::string s;
  310. int i;
  311. for(i = 1; i < argc; ++i)
  312. {
  313. std::cout << "Findings URL's in " << argv[i] << ":" << std::endl;
  314. s.erase();
  315. std::ifstream is(argv[i]);
  316. load_file(s, is);
  317. boost::sregex_token_iterator i(s.begin(), s.end(), e, 1);
  318. boost::sregex_token_iterator j;
  319. while(i != j)
  320. {
  321. std::cout << *i++ << std::endl;
  322. }
  323. }
  324. //
  325. // alternative method:
  326. // test the array-literal constructor, and split out the whole
  327. // match as well as $1....
  328. //
  329. for(i = 1; i < argc; ++i)
  330. {
  331. std::cout << "Findings URL's in " << argv[i] << ":" << std::endl;
  332. s.erase();
  333. std::ifstream is(argv[i]);
  334. load_file(s, is);
  335. const int subs[] = {1, 0,};
  336. boost::sregex_token_iterator i(s.begin(), s.end(), e, subs);
  337. boost::sregex_token_iterator j;
  338. while(i != j)
  339. {
  340. std::cout << *i++ << std::endl;
  341. }
  342. }
  343. return 0;
  344. }
  345. [endsect]