knuth_morris_pratt.hpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. /*
  2. Copyright (c) Marshall Clow 2010-2012.
  3. Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. For more information, see http://www.boost.org
  6. */
  7. #ifndef BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_SEARCH_HPP
  8. #define BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_SEARCH_HPP
  9. #include <vector>
  10. #include <iterator> // for std::iterator_traits
  11. #include <boost/config.hpp>
  12. #include <boost/assert.hpp>
  13. #include <boost/static_assert.hpp>
  14. #include <boost/range/begin.hpp>
  15. #include <boost/range/end.hpp>
  16. #include <boost/utility/enable_if.hpp>
  17. #include <boost/type_traits/is_same.hpp>
  18. #include <boost/algorithm/searching/detail/debugging.hpp>
  19. // #define BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_DEBUG
  20. namespace boost { namespace algorithm {
  21. // #define NEW_KMP
  22. /*
  23. A templated version of the Knuth-Morris-Pratt searching algorithm.
  24. Requirements:
  25. * Random-access iterators
  26. * The two iterator types (I1 and I2) must "point to" the same underlying type.
  27. http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
  28. http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/kmpen.htm
  29. */
  30. template <typename patIter>
  31. class knuth_morris_pratt {
  32. typedef typename std::iterator_traits<patIter>::difference_type difference_type;
  33. public:
  34. knuth_morris_pratt ( patIter first, patIter last )
  35. : pat_first ( first ), pat_last ( last ),
  36. k_pattern_length ( std::distance ( pat_first, pat_last )),
  37. skip_ ( k_pattern_length + 1 ) {
  38. #ifdef NEW_KMP
  39. preKmp ( pat_first, pat_last );
  40. #else
  41. init_skip_table ( pat_first, pat_last );
  42. #endif
  43. #ifdef BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_DEBUG
  44. detail::PrintTable ( skip_.begin (), skip_.end ());
  45. #endif
  46. }
  47. ~knuth_morris_pratt () {}
  48. /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last, Pred p )
  49. /// \brief Searches the corpus for the pattern that was passed into the constructor
  50. ///
  51. /// \param corpus_first The start of the data to search (Random Access Iterator)
  52. /// \param corpus_last One past the end of the data to search
  53. /// \param p A predicate used for the search comparisons.
  54. ///
  55. template <typename corpusIter>
  56. std::pair<corpusIter, corpusIter>
  57. operator () ( corpusIter corpus_first, corpusIter corpus_last ) const {
  58. BOOST_STATIC_ASSERT (( boost::is_same<
  59. typename std::iterator_traits<patIter>::value_type,
  60. typename std::iterator_traits<corpusIter>::value_type>::value ));
  61. if ( corpus_first == corpus_last ) return std::make_pair(corpus_last, corpus_last); // if nothing to search, we didn't find it!
  62. if ( pat_first == pat_last ) return std::make_pair(corpus_first, corpus_first); // empty pattern matches at start
  63. const difference_type k_corpus_length = std::distance ( corpus_first, corpus_last );
  64. // If the pattern is larger than the corpus, we can't find it!
  65. if ( k_corpus_length < k_pattern_length )
  66. return std::make_pair(corpus_last, corpus_last);
  67. return do_search ( corpus_first, corpus_last, k_corpus_length );
  68. }
  69. template <typename Range>
  70. std::pair<typename boost::range_iterator<Range>::type, typename boost::range_iterator<Range>::type>
  71. operator () ( Range &r ) const {
  72. return (*this) (boost::begin(r), boost::end(r));
  73. }
  74. private:
  75. /// \cond DOXYGEN_HIDE
  76. patIter pat_first, pat_last;
  77. const difference_type k_pattern_length;
  78. std::vector <difference_type> skip_;
  79. /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last, Pred p )
  80. /// \brief Searches the corpus for the pattern that was passed into the constructor
  81. ///
  82. /// \param corpus_first The start of the data to search (Random Access Iterator)
  83. /// \param corpus_last One past the end of the data to search
  84. /// \param p A predicate used for the search comparisons.
  85. ///
  86. template <typename corpusIter>
  87. std::pair<corpusIter, corpusIter>
  88. do_search ( corpusIter corpus_first, corpusIter corpus_last,
  89. difference_type k_corpus_length ) const {
  90. difference_type match_start = 0; // position in the corpus that we're matching
  91. #ifdef NEW_KMP
  92. int patternIdx = 0;
  93. while ( match_start < k_corpus_length ) {
  94. while ( patternIdx > -1 && pat_first[patternIdx] != corpus_first [match_start] )
  95. patternIdx = skip_ [patternIdx]; //<--- Shifting the pattern on mismatch
  96. patternIdx++;
  97. match_start++; //<--- corpus is always increased by 1
  98. if ( patternIdx >= (int) k_pattern_length )
  99. return corpus_first + match_start - patternIdx;
  100. }
  101. #else
  102. // At this point, we know:
  103. // k_pattern_length <= k_corpus_length
  104. // for all elements of skip, it holds -1 .. k_pattern_length
  105. //
  106. // In the loop, we have the following invariants
  107. // idx is in the range 0 .. k_pattern_length
  108. // match_start is in the range 0 .. k_corpus_length - k_pattern_length + 1
  109. const difference_type last_match = k_corpus_length - k_pattern_length;
  110. difference_type idx = 0; // position in the pattern we're comparing
  111. while ( match_start <= last_match ) {
  112. while ( pat_first [ idx ] == corpus_first [ match_start + idx ] ) {
  113. if ( ++idx == k_pattern_length )
  114. return std::make_pair(corpus_first + match_start, corpus_first + match_start + k_pattern_length);
  115. }
  116. // Figure out where to start searching again
  117. // assert ( idx - skip_ [ idx ] > 0 ); // we're always moving forward
  118. match_start += idx - skip_ [ idx ];
  119. idx = skip_ [ idx ] >= 0 ? skip_ [ idx ] : 0;
  120. // assert ( idx >= 0 && idx < k_pattern_length );
  121. }
  122. #endif
  123. // We didn't find anything
  124. return std::make_pair(corpus_last, corpus_last);
  125. }
  126. void preKmp ( patIter first, patIter last ) {
  127. const difference_type count = std::distance ( first, last );
  128. difference_type i, j;
  129. i = 0;
  130. j = skip_[0] = -1;
  131. while (i < count) {
  132. while (j > -1 && first[i] != first[j])
  133. j = skip_[j];
  134. i++;
  135. j++;
  136. if (first[i] == first[j])
  137. skip_[i] = skip_[j];
  138. else
  139. skip_[i] = j;
  140. }
  141. }
  142. void init_skip_table ( patIter first, patIter last ) {
  143. const difference_type count = std::distance ( first, last );
  144. difference_type j;
  145. skip_ [ 0 ] = -1;
  146. for ( int i = 1; i <= count; ++i ) {
  147. j = skip_ [ i - 1 ];
  148. while ( j >= 0 ) {
  149. if ( first [ j ] == first [ i - 1 ] )
  150. break;
  151. j = skip_ [ j ];
  152. }
  153. skip_ [ i ] = j + 1;
  154. }
  155. }
  156. // \endcond
  157. };
  158. /* Two ranges as inputs gives us four possibilities; with 2,3,3,4 parameters
  159. Use a bit of TMP to disambiguate the 3-argument templates */
  160. /// \fn knuth_morris_pratt_search ( corpusIter corpus_first, corpusIter corpus_last,
  161. /// patIter pat_first, patIter pat_last )
  162. /// \brief Searches the corpus for the pattern.
  163. ///
  164. /// \param corpus_first The start of the data to search (Random Access Iterator)
  165. /// \param corpus_last One past the end of the data to search
  166. /// \param pat_first The start of the pattern to search for (Random Access Iterator)
  167. /// \param pat_last One past the end of the data to search for
  168. ///
  169. template <typename patIter, typename corpusIter>
  170. std::pair<corpusIter, corpusIter> knuth_morris_pratt_search (
  171. corpusIter corpus_first, corpusIter corpus_last,
  172. patIter pat_first, patIter pat_last )
  173. {
  174. knuth_morris_pratt<patIter> kmp ( pat_first, pat_last );
  175. return kmp ( corpus_first, corpus_last );
  176. }
  177. template <typename PatternRange, typename corpusIter>
  178. std::pair<corpusIter, corpusIter> knuth_morris_pratt_search (
  179. corpusIter corpus_first, corpusIter corpus_last, const PatternRange &pattern )
  180. {
  181. typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
  182. knuth_morris_pratt<pattern_iterator> kmp ( boost::begin(pattern), boost::end (pattern));
  183. return kmp ( corpus_first, corpus_last );
  184. }
  185. template <typename patIter, typename CorpusRange>
  186. typename boost::disable_if_c<
  187. boost::is_same<CorpusRange, patIter>::value,
  188. std::pair<typename boost::range_iterator<CorpusRange>::type, typename boost::range_iterator<CorpusRange>::type> >
  189. ::type
  190. knuth_morris_pratt_search ( CorpusRange &corpus, patIter pat_first, patIter pat_last )
  191. {
  192. knuth_morris_pratt<patIter> kmp ( pat_first, pat_last );
  193. return kmp (boost::begin (corpus), boost::end (corpus));
  194. }
  195. template <typename PatternRange, typename CorpusRange>
  196. std::pair<typename boost::range_iterator<CorpusRange>::type, typename boost::range_iterator<CorpusRange>::type>
  197. knuth_morris_pratt_search ( CorpusRange &corpus, const PatternRange &pattern )
  198. {
  199. typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
  200. knuth_morris_pratt<pattern_iterator> kmp ( boost::begin(pattern), boost::end (pattern));
  201. return kmp (boost::begin (corpus), boost::end (corpus));
  202. }
  203. // Creator functions -- take a pattern range, return an object
  204. template <typename Range>
  205. boost::algorithm::knuth_morris_pratt<typename boost::range_iterator<const Range>::type>
  206. make_knuth_morris_pratt ( const Range &r ) {
  207. return boost::algorithm::knuth_morris_pratt
  208. <typename boost::range_iterator<const Range>::type> (boost::begin(r), boost::end(r));
  209. }
  210. template <typename Range>
  211. boost::algorithm::knuth_morris_pratt<typename boost::range_iterator<Range>::type>
  212. make_knuth_morris_pratt ( Range &r ) {
  213. return boost::algorithm::knuth_morris_pratt
  214. <typename boost::range_iterator<Range>::type> (boost::begin(r), boost::end(r));
  215. }
  216. }}
  217. #endif // BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_SEARCH_HPP