1 /*
2    Copyright (c) Marshall Clow 2010-2012.
3 
4    Distributed under the Boost Software License, Version 1.0. (See accompanying
5    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 
7     For more information, see http://www.boost.org
8 */
9 
10 #ifndef BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_SEARCH_HPP
11 #define BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_SEARCH_HPP
12 
13 #include <vector>
14 #include <iterator>     // for std::iterator_traits
15 
16 #include <boost/config.hpp>
17 #include <boost/assert.hpp>
18 #include <boost/static_assert.hpp>
19 
20 #include <boost/range/begin.hpp>
21 #include <boost/range/end.hpp>
22 
23 #include <boost/utility/enable_if.hpp>
24 #include <boost/type_traits/is_same.hpp>
25 
26 #include <boost/algorithm/searching/detail/debugging.hpp>
27 
28 // #define  BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_DEBUG
29 
30 namespace boost { namespace algorithm {
31 
32 // #define  NEW_KMP
33 
34 /*
35     A templated version of the Knuth-Morris-Pratt searching algorithm.
36 
37     Requirements:
38         * Random-access iterators
39         * The two iterator types (I1 and I2) must "point to" the same underlying type.
40 
41     http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
42     http://www.inf.fh-flensburg.de/lang/algorithmen/pattern/kmpen.htm
43 */
44 
45     template <typename patIter>
46     class knuth_morris_pratt {
47         typedef typename std::iterator_traits<patIter>::difference_type difference_type;
48     public:
knuth_morris_pratt(patIter first,patIter last)49         knuth_morris_pratt ( patIter first, patIter last )
50                 : pat_first ( first ), pat_last ( last ),
51                   k_pattern_length ( std::distance ( pat_first, pat_last )),
52                   skip_ ( k_pattern_length + 1 ) {
53 #ifdef NEW_KMP
54             preKmp ( pat_first, pat_last );
55 #else
56             init_skip_table ( pat_first, pat_last );
57 #endif
58 #ifdef BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_DEBUG
59             detail::PrintTable ( skip_.begin (), skip_.end ());
60 #endif
61             }
62 
~knuth_morris_pratt()63         ~knuth_morris_pratt () {}
64 
65         /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last, Pred p )
66         /// \brief Searches the corpus for the pattern that was passed into the constructor
67         ///
68         /// \param corpus_first The start of the data to search (Random Access Iterator)
69         /// \param corpus_last  One past the end of the data to search
70         /// \param p            A predicate used for the search comparisons.
71         ///
72         template <typename corpusIter>
73         std::pair<corpusIter, corpusIter>
operator ()(corpusIter corpus_first,corpusIter corpus_last) const74         operator () ( corpusIter corpus_first, corpusIter corpus_last ) const {
75             BOOST_STATIC_ASSERT (( boost::is_same<
76                 typename std::iterator_traits<patIter>::value_type,
77                 typename std::iterator_traits<corpusIter>::value_type>::value ));
78 
79             if ( corpus_first == corpus_last ) return std::make_pair(corpus_last, corpus_last);   // if nothing to search, we didn't find it!
80             if (    pat_first ==    pat_last ) return std::make_pair(corpus_first, corpus_first); // empty pattern matches at start
81 
82             const difference_type k_corpus_length = std::distance ( corpus_first, corpus_last );
83         //  If the pattern is larger than the corpus, we can't find it!
84             if ( k_corpus_length < k_pattern_length )
85                 return std::make_pair(corpus_last, corpus_last);
86 
87             return do_search ( corpus_first, corpus_last, k_corpus_length );
88             }
89 
90         template <typename Range>
91         std::pair<typename boost::range_iterator<Range>::type, typename boost::range_iterator<Range>::type>
operator ()(Range & r) const92         operator () ( Range &r ) const {
93             return (*this) (boost::begin(r), boost::end(r));
94             }
95 
96     private:
97 /// \cond DOXYGEN_HIDE
98         patIter pat_first, pat_last;
99         const difference_type k_pattern_length;
100         std::vector <difference_type> skip_;
101 
102         /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last, Pred p )
103         /// \brief Searches the corpus for the pattern that was passed into the constructor
104         ///
105         /// \param corpus_first The start of the data to search (Random Access Iterator)
106         /// \param corpus_last  One past the end of the data to search
107         /// \param p            A predicate used for the search comparisons.
108         ///
109         template <typename corpusIter>
110         std::pair<corpusIter, corpusIter>
do_search(corpusIter corpus_first,corpusIter corpus_last,difference_type k_corpus_length) const111         do_search ( corpusIter corpus_first, corpusIter corpus_last,
112                                                 difference_type k_corpus_length ) const {
113             difference_type match_start = 0;  // position in the corpus that we're matching
114 
115 #ifdef NEW_KMP
116             int patternIdx = 0;
117             while ( match_start < k_corpus_length ) {
118                 while ( patternIdx > -1 && pat_first[patternIdx] != corpus_first [match_start] )
119                     patternIdx = skip_ [patternIdx]; //<--- Shifting the pattern on mismatch
120 
121                 patternIdx++;
122                 match_start++; //<--- corpus is always increased by 1
123 
124                 if ( patternIdx >= (int) k_pattern_length )
125                     return corpus_first + match_start - patternIdx;
126                 }
127 
128 #else
129 //  At this point, we know:
130 //          k_pattern_length <= k_corpus_length
131 //          for all elements of skip, it holds -1 .. k_pattern_length
132 //
133 //          In the loop, we have the following invariants
134 //              idx is in the range 0 .. k_pattern_length
135 //              match_start is in the range 0 .. k_corpus_length - k_pattern_length + 1
136 
137             const difference_type last_match = k_corpus_length - k_pattern_length;
138             difference_type idx = 0;          // position in the pattern we're comparing
139 
140             while ( match_start <= last_match ) {
141                 while ( pat_first [ idx ] == corpus_first [ match_start + idx ] ) {
142                     if ( ++idx == k_pattern_length )
143                         return std::make_pair(corpus_first + match_start, corpus_first + match_start + k_pattern_length);
144                     }
145             //  Figure out where to start searching again
146            //   assert ( idx - skip_ [ idx ] > 0 ); // we're always moving forward
147                 match_start += idx - skip_ [ idx ];
148                 idx = skip_ [ idx ] >= 0 ? skip_ [ idx ] : 0;
149            //   assert ( idx >= 0 && idx < k_pattern_length );
150                 }
151 #endif
152 
153         //  We didn't find anything
154             return std::make_pair(corpus_last, corpus_last);
155             }
156 
157 
preKmp(patIter first,patIter last)158         void preKmp ( patIter first, patIter last ) {
159            const difference_type count = std::distance ( first, last );
160 
161            difference_type i, j;
162 
163            i = 0;
164            j = skip_[0] = -1;
165            while (i < count) {
166               while (j > -1 && first[i] != first[j])
167                  j = skip_[j];
168               i++;
169               j++;
170               if (first[i] == first[j])
171                  skip_[i] = skip_[j];
172               else
173                  skip_[i] = j;
174            }
175         }
176 
177 
init_skip_table(patIter first,patIter last)178         void init_skip_table ( patIter first, patIter last ) {
179             const difference_type count = std::distance ( first, last );
180 
181             difference_type j;
182             skip_ [ 0 ] = -1;
183             for ( int i = 1; i <= count; ++i ) {
184                 j = skip_ [ i - 1 ];
185                 while ( j >= 0 ) {
186                     if ( first [ j ] == first [ i - 1 ] )
187                         break;
188                     j = skip_ [ j ];
189                     }
190                 skip_ [ i ] = j + 1;
191                 }
192             }
193 // \endcond
194         };
195 
196 
197 /*  Two ranges as inputs gives us four possibilities; with 2,3,3,4 parameters
198     Use a bit of TMP to disambiguate the 3-argument templates */
199 
200 /// \fn knuth_morris_pratt_search ( corpusIter corpus_first, corpusIter corpus_last,
201 ///       patIter pat_first, patIter pat_last )
202 /// \brief Searches the corpus for the pattern.
203 ///
204 /// \param corpus_first The start of the data to search (Random Access Iterator)
205 /// \param corpus_last  One past the end of the data to search
206 /// \param pat_first    The start of the pattern to search for (Random Access Iterator)
207 /// \param pat_last     One past the end of the data to search for
208 ///
209     template <typename patIter, typename corpusIter>
knuth_morris_pratt_search(corpusIter corpus_first,corpusIter corpus_last,patIter pat_first,patIter pat_last)210     std::pair<corpusIter, corpusIter> knuth_morris_pratt_search (
211                   corpusIter corpus_first, corpusIter corpus_last,
212                   patIter pat_first, patIter pat_last )
213     {
214         knuth_morris_pratt<patIter> kmp ( pat_first, pat_last );
215         return kmp ( corpus_first, corpus_last );
216     }
217 
218     template <typename PatternRange, typename corpusIter>
knuth_morris_pratt_search(corpusIter corpus_first,corpusIter corpus_last,const PatternRange & pattern)219     std::pair<corpusIter, corpusIter> knuth_morris_pratt_search (
220         corpusIter corpus_first, corpusIter corpus_last, const PatternRange &pattern )
221     {
222         typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
223         knuth_morris_pratt<pattern_iterator> kmp ( boost::begin(pattern), boost::end (pattern));
224         return kmp ( corpus_first, corpus_last );
225     }
226 
227     template <typename patIter, typename CorpusRange>
228     typename boost::disable_if_c<
229         boost::is_same<CorpusRange, patIter>::value,
230         std::pair<typename boost::range_iterator<CorpusRange>::type, typename boost::range_iterator<CorpusRange>::type> >
231     ::type
knuth_morris_pratt_search(CorpusRange & corpus,patIter pat_first,patIter pat_last)232     knuth_morris_pratt_search ( CorpusRange &corpus, patIter pat_first, patIter pat_last )
233     {
234         knuth_morris_pratt<patIter> kmp ( pat_first, pat_last );
235         return kmp (boost::begin (corpus), boost::end (corpus));
236     }
237 
238     template <typename PatternRange, typename CorpusRange>
239     std::pair<typename boost::range_iterator<CorpusRange>::type, typename boost::range_iterator<CorpusRange>::type>
knuth_morris_pratt_search(CorpusRange & corpus,const PatternRange & pattern)240     knuth_morris_pratt_search ( CorpusRange &corpus, const PatternRange &pattern )
241     {
242         typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
243         knuth_morris_pratt<pattern_iterator> kmp ( boost::begin(pattern), boost::end (pattern));
244         return kmp (boost::begin (corpus), boost::end (corpus));
245     }
246 
247 
248     //  Creator functions -- take a pattern range, return an object
249     template <typename Range>
250     boost::algorithm::knuth_morris_pratt<typename boost::range_iterator<const Range>::type>
make_knuth_morris_pratt(const Range & r)251     make_knuth_morris_pratt ( const Range &r ) {
252         return boost::algorithm::knuth_morris_pratt
253             <typename boost::range_iterator<const Range>::type> (boost::begin(r), boost::end(r));
254         }
255 
256     template <typename Range>
257     boost::algorithm::knuth_morris_pratt<typename boost::range_iterator<Range>::type>
make_knuth_morris_pratt(Range & r)258     make_knuth_morris_pratt ( Range &r ) {
259         return boost::algorithm::knuth_morris_pratt
260             <typename boost::range_iterator<Range>::type> (boost::begin(r), boost::end(r));
261         }
262 }}
263 
264 #endif  // BOOST_ALGORITHM_KNUTH_MORRIS_PRATT_SEARCH_HPP
265