xref: /aosp_15_r20/external/licenseclassifier/stringclassifier/searchset/searchset.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc.
2*46c4c49dSIbrahim Kanouche//
3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
6*46c4c49dSIbrahim Kanouche//
7*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
8*46c4c49dSIbrahim Kanouche//
9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
13*46c4c49dSIbrahim Kanouche// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouche// Package searchset generates hashes for all substrings of a text. Potential
16*46c4c49dSIbrahim Kanouche// matches between two SearchSet objects can then be determined quickly.
17*46c4c49dSIbrahim Kanouche// Generating the hashes can be expensive, so it's best to perform it once. If
18*46c4c49dSIbrahim Kanouche// the text is part of a known corpus, then the SearchSet can be serialized and
19*46c4c49dSIbrahim Kanouche// kept in an archive.
20*46c4c49dSIbrahim Kanouche//
21*46c4c49dSIbrahim Kanouche// Matching occurs by "mapping" ranges from the source text into the target
22*46c4c49dSIbrahim Kanouche// text but still retaining the source order:
23*46c4c49dSIbrahim Kanouche//
24*46c4c49dSIbrahim Kanouche//	SOURCE: |-----------------------------|
25*46c4c49dSIbrahim Kanouche//
26*46c4c49dSIbrahim Kanouche//	TARGET: |*****************************************|
27*46c4c49dSIbrahim Kanouche//
28*46c4c49dSIbrahim Kanouche//	MAP SOURCE SECTIONS ONTO TARGET IN SOURCE ORDER:
29*46c4c49dSIbrahim Kanouche//
30*46c4c49dSIbrahim Kanouche//	  S:  |-[--]-----[---]------[----]------|
31*46c4c49dSIbrahim Kanouche//	         /         |           \
32*46c4c49dSIbrahim Kanouche//	      |---|   |---------|   |-------------|
33*46c4c49dSIbrahim Kanouche//	  T: |*****************************************|
34*46c4c49dSIbrahim Kanouche//
35*46c4c49dSIbrahim Kanouche// Note that a single source range may match many different ranges in the
36*46c4c49dSIbrahim Kanouche// target. The matching algorithm untangles these so that all matched ranges
37*46c4c49dSIbrahim Kanouche// are in order with respect to the source ranges. This is especially important
38*46c4c49dSIbrahim Kanouche// since the source text may occur more than once in the target text. The
39*46c4c49dSIbrahim Kanouche// algorithm finds each potential occurrence of S in T and returns all as
40*46c4c49dSIbrahim Kanouche// potential matched ranges.
41*46c4c49dSIbrahim Kanouchepackage searchset
42*46c4c49dSIbrahim Kanouche
43*46c4c49dSIbrahim Kanoucheimport (
44*46c4c49dSIbrahim Kanouche	"encoding/gob"
45*46c4c49dSIbrahim Kanouche	"fmt"
46*46c4c49dSIbrahim Kanouche	"io"
47*46c4c49dSIbrahim Kanouche	"sort"
48*46c4c49dSIbrahim Kanouche
49*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier/searchset/tokenizer"
50*46c4c49dSIbrahim Kanouche)
51*46c4c49dSIbrahim Kanouche
52*46c4c49dSIbrahim Kanouche// DefaultGranularity is the minimum size (in words) of the hash chunks.
53*46c4c49dSIbrahim Kanoucheconst DefaultGranularity = 3
54*46c4c49dSIbrahim Kanouche
55*46c4c49dSIbrahim Kanouche// SearchSet is a set of substrings that have hashes associated with them,
56*46c4c49dSIbrahim Kanouche// making it fast to search for potential matches.
57*46c4c49dSIbrahim Kanouchetype SearchSet struct {
58*46c4c49dSIbrahim Kanouche	// Tokens is a tokenized list of the original input string.
59*46c4c49dSIbrahim Kanouche	Tokens tokenizer.Tokens
60*46c4c49dSIbrahim Kanouche	// Hashes is a map of checksums to a range of tokens.
61*46c4c49dSIbrahim Kanouche	Hashes tokenizer.Hash
62*46c4c49dSIbrahim Kanouche	// Checksums is a list of checksums ordered from longest range to
63*46c4c49dSIbrahim Kanouche	// shortest.
64*46c4c49dSIbrahim Kanouche	Checksums []uint32
65*46c4c49dSIbrahim Kanouche	// ChecksumRanges are the token ranges for the above checksums.
66*46c4c49dSIbrahim Kanouche	ChecksumRanges tokenizer.TokenRanges
67*46c4c49dSIbrahim Kanouche
68*46c4c49dSIbrahim Kanouche	nodes []*node
69*46c4c49dSIbrahim Kanouche}
70*46c4c49dSIbrahim Kanouche
71*46c4c49dSIbrahim Kanouche// node consists of a range of tokens along with the checksum for those tokens.
72*46c4c49dSIbrahim Kanouchetype node struct {
73*46c4c49dSIbrahim Kanouche	checksum uint32
74*46c4c49dSIbrahim Kanouche	tokens   *tokenizer.TokenRange
75*46c4c49dSIbrahim Kanouche}
76*46c4c49dSIbrahim Kanouche
77*46c4c49dSIbrahim Kanouchefunc (n *node) String() string {
78*46c4c49dSIbrahim Kanouche	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
79*46c4c49dSIbrahim Kanouche}
80*46c4c49dSIbrahim Kanouche
81*46c4c49dSIbrahim Kanouche// New creates a new SearchSet object. It generates a hash for each substring of "s".
82*46c4c49dSIbrahim Kanouchefunc New(s string, granularity int) *SearchSet {
83*46c4c49dSIbrahim Kanouche	toks := tokenizer.Tokenize(s)
84*46c4c49dSIbrahim Kanouche
85*46c4c49dSIbrahim Kanouche	// Start generating hash values for all substrings within the text.
86*46c4c49dSIbrahim Kanouche	h := make(tokenizer.Hash)
87*46c4c49dSIbrahim Kanouche	checksums, tokenRanges := toks.GenerateHashes(h, func(a, b int) int {
88*46c4c49dSIbrahim Kanouche		if a < b {
89*46c4c49dSIbrahim Kanouche			return a
90*46c4c49dSIbrahim Kanouche		}
91*46c4c49dSIbrahim Kanouche		return b
92*46c4c49dSIbrahim Kanouche	}(len(toks), granularity))
93*46c4c49dSIbrahim Kanouche	sset := &SearchSet{
94*46c4c49dSIbrahim Kanouche		Tokens:         toks,
95*46c4c49dSIbrahim Kanouche		Hashes:         h,
96*46c4c49dSIbrahim Kanouche		Checksums:      checksums,
97*46c4c49dSIbrahim Kanouche		ChecksumRanges: tokenRanges,
98*46c4c49dSIbrahim Kanouche	}
99*46c4c49dSIbrahim Kanouche	sset.GenerateNodeList()
100*46c4c49dSIbrahim Kanouche	return sset
101*46c4c49dSIbrahim Kanouche}
102*46c4c49dSIbrahim Kanouche
103*46c4c49dSIbrahim Kanouche// GenerateNodeList creates a node list out of the search set.
104*46c4c49dSIbrahim Kanouchefunc (s *SearchSet) GenerateNodeList() {
105*46c4c49dSIbrahim Kanouche	if len(s.Tokens) == 0 {
106*46c4c49dSIbrahim Kanouche		return
107*46c4c49dSIbrahim Kanouche	}
108*46c4c49dSIbrahim Kanouche
109*46c4c49dSIbrahim Kanouche	for i := 0; i < len(s.Checksums); i++ {
110*46c4c49dSIbrahim Kanouche		s.nodes = append(s.nodes, &node{
111*46c4c49dSIbrahim Kanouche			checksum: s.Checksums[i],
112*46c4c49dSIbrahim Kanouche			tokens:   s.ChecksumRanges[i],
113*46c4c49dSIbrahim Kanouche		})
114*46c4c49dSIbrahim Kanouche	}
115*46c4c49dSIbrahim Kanouche}
116*46c4c49dSIbrahim Kanouche
117*46c4c49dSIbrahim Kanouche// Serialize emits the SearchSet out so that it can be recreated at a later
118*46c4c49dSIbrahim Kanouche// time.
119*46c4c49dSIbrahim Kanouchefunc (s *SearchSet) Serialize(w io.Writer) error {
120*46c4c49dSIbrahim Kanouche	return gob.NewEncoder(w).Encode(s)
121*46c4c49dSIbrahim Kanouche}
122*46c4c49dSIbrahim Kanouche
123*46c4c49dSIbrahim Kanouche// Deserialize reads a file with a serialized SearchSet in it and reconstructs it.
124*46c4c49dSIbrahim Kanouchefunc Deserialize(r io.Reader, s *SearchSet) error {
125*46c4c49dSIbrahim Kanouche	if err := gob.NewDecoder(r).Decode(&s); err != nil {
126*46c4c49dSIbrahim Kanouche		return err
127*46c4c49dSIbrahim Kanouche	}
128*46c4c49dSIbrahim Kanouche	s.GenerateNodeList()
129*46c4c49dSIbrahim Kanouche	return nil
130*46c4c49dSIbrahim Kanouche}
131*46c4c49dSIbrahim Kanouche
132*46c4c49dSIbrahim Kanouche// MatchRange is the range within the source text that is a match to the range
133*46c4c49dSIbrahim Kanouche// in the target text.
134*46c4c49dSIbrahim Kanouchetype MatchRange struct {
135*46c4c49dSIbrahim Kanouche	// Offsets into the source tokens.
136*46c4c49dSIbrahim Kanouche	SrcStart, SrcEnd int
137*46c4c49dSIbrahim Kanouche	// Offsets into the target tokens.
138*46c4c49dSIbrahim Kanouche	TargetStart, TargetEnd int
139*46c4c49dSIbrahim Kanouche}
140*46c4c49dSIbrahim Kanouche
141*46c4c49dSIbrahim Kanouche// in returns true if the start and end are enclosed in the match range.
142*46c4c49dSIbrahim Kanouchefunc (m *MatchRange) in(start, end int) bool {
143*46c4c49dSIbrahim Kanouche	return start >= m.TargetStart && end <= m.TargetEnd
144*46c4c49dSIbrahim Kanouche}
145*46c4c49dSIbrahim Kanouche
146*46c4c49dSIbrahim Kanouchefunc (m *MatchRange) String() string {
147*46c4c49dSIbrahim Kanouche	return fmt.Sprintf("[%v, %v)->[%v, %v)", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd)
148*46c4c49dSIbrahim Kanouche}
149*46c4c49dSIbrahim Kanouche
150*46c4c49dSIbrahim Kanouche// MatchRanges is a list of "MatchRange"s. The ranges are monotonically
151*46c4c49dSIbrahim Kanouche// increasing in value and indicate a single potential occurrence of the source
152*46c4c49dSIbrahim Kanouche// text in the target text.
153*46c4c49dSIbrahim Kanouchetype MatchRanges []*MatchRange
154*46c4c49dSIbrahim Kanouche
155*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Len() int      { return len(m) }
156*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
157*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Less(i, j int) bool {
158*46c4c49dSIbrahim Kanouche	if m[i].TargetStart < m[j].TargetStart {
159*46c4c49dSIbrahim Kanouche		return true
160*46c4c49dSIbrahim Kanouche	}
161*46c4c49dSIbrahim Kanouche	return m[i].TargetStart == m[j].TargetStart && m[i].SrcStart < m[j].SrcStart
162*46c4c49dSIbrahim Kanouche}
163*46c4c49dSIbrahim Kanouche
164*46c4c49dSIbrahim Kanouche// TargetRange is the start and stop token offsets into the target text.
165*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) TargetRange(target *SearchSet) (start, end int) {
166*46c4c49dSIbrahim Kanouche	start = target.Tokens[m[0].TargetStart].Offset
167*46c4c49dSIbrahim Kanouche	end = target.Tokens[m[len(m)-1].TargetEnd-1].Offset + len(target.Tokens[m[len(m)-1].TargetEnd-1].Text)
168*46c4c49dSIbrahim Kanouche	return start, end
169*46c4c49dSIbrahim Kanouche}
170*46c4c49dSIbrahim Kanouche
171*46c4c49dSIbrahim Kanouche// Size is the number of source tokens that were matched.
172*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Size() int {
173*46c4c49dSIbrahim Kanouche	sum := 0
174*46c4c49dSIbrahim Kanouche	for _, mr := range m {
175*46c4c49dSIbrahim Kanouche		sum += mr.SrcEnd - mr.SrcStart
176*46c4c49dSIbrahim Kanouche	}
177*46c4c49dSIbrahim Kanouche	return sum
178*46c4c49dSIbrahim Kanouche}
179*46c4c49dSIbrahim Kanouche
180*46c4c49dSIbrahim Kanouche// FindPotentialMatches returns the ranges in the target (unknown) text that
181*46c4c49dSIbrahim Kanouche// are best potential matches to the source (known) text.
182*46c4c49dSIbrahim Kanouchefunc FindPotentialMatches(src, target *SearchSet) []MatchRanges {
183*46c4c49dSIbrahim Kanouche	matchedRanges := getMatchedRanges(src, target)
184*46c4c49dSIbrahim Kanouche	if len(matchedRanges) == 0 {
185*46c4c49dSIbrahim Kanouche		return nil
186*46c4c49dSIbrahim Kanouche	}
187*46c4c49dSIbrahim Kanouche
188*46c4c49dSIbrahim Kanouche	// Cleanup the matching ranges so that we get the longest contiguous ranges.
189*46c4c49dSIbrahim Kanouche	for i := 0; i < len(matchedRanges); i++ {
190*46c4c49dSIbrahim Kanouche		matchedRanges[i] = coalesceMatchRanges(matchedRanges[i])
191*46c4c49dSIbrahim Kanouche	}
192*46c4c49dSIbrahim Kanouche	return matchedRanges
193*46c4c49dSIbrahim Kanouche}
194*46c4c49dSIbrahim Kanouche
195*46c4c49dSIbrahim Kanouche// getMatchedRanges finds the ranges in the target text that match the source
196*46c4c49dSIbrahim Kanouche// text. There can be multiple occurrences of the source text within the target
197*46c4c49dSIbrahim Kanouche// text. Each separate occurrence is an entry in the returned slice.
198*46c4c49dSIbrahim Kanouchefunc getMatchedRanges(src, target *SearchSet) []MatchRanges {
199*46c4c49dSIbrahim Kanouche	matched := targetMatchedRanges(src, target)
200*46c4c49dSIbrahim Kanouche	if len(matched) == 0 {
201*46c4c49dSIbrahim Kanouche		return nil
202*46c4c49dSIbrahim Kanouche	}
203*46c4c49dSIbrahim Kanouche	sort.Sort(matched)
204*46c4c49dSIbrahim Kanouche	matched = untangleSourceRanges(matched)
205*46c4c49dSIbrahim Kanouche	matchedRanges := splitRanges(matched)
206*46c4c49dSIbrahim Kanouche	return mergeConsecutiveRanges(matchedRanges)
207*46c4c49dSIbrahim Kanouche}
208*46c4c49dSIbrahim Kanouche
209*46c4c49dSIbrahim Kanouchefunc extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool {
210*46c4c49dSIbrahim Kanouche	if len(mr) == 0 {
211*46c4c49dSIbrahim Kanouche		return false
212*46c4c49dSIbrahim Kanouche	}
213*46c4c49dSIbrahim Kanouche	for _, tv := range tr {
214*46c4c49dSIbrahim Kanouche		for _, mv := range mr {
215*46c4c49dSIbrahim Kanouche			if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd {
216*46c4c49dSIbrahim Kanouche				return true
217*46c4c49dSIbrahim Kanouche			}
218*46c4c49dSIbrahim Kanouche		}
219*46c4c49dSIbrahim Kanouche	}
220*46c4c49dSIbrahim Kanouche	return false
221*46c4c49dSIbrahim Kanouche}
222*46c4c49dSIbrahim Kanouche
223*46c4c49dSIbrahim Kanouche// targetMatchedRanges finds matching sequences in target and src ordered by target position
224*46c4c49dSIbrahim Kanouchefunc targetMatchedRanges(src, target *SearchSet) MatchRanges {
225*46c4c49dSIbrahim Kanouche	if src.nodes == nil {
226*46c4c49dSIbrahim Kanouche		return nil
227*46c4c49dSIbrahim Kanouche	}
228*46c4c49dSIbrahim Kanouche
229*46c4c49dSIbrahim Kanouche	var matched MatchRanges
230*46c4c49dSIbrahim Kanouche	var previous *node
231*46c4c49dSIbrahim Kanouche	var possible []MatchRanges
232*46c4c49dSIbrahim Kanouche	for _, tgtNode := range target.nodes {
233*46c4c49dSIbrahim Kanouche		sr, ok := src.Hashes[tgtNode.checksum]
234*46c4c49dSIbrahim Kanouche		if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) {
235*46c4c49dSIbrahim Kanouche			for _, r := range possible {
236*46c4c49dSIbrahim Kanouche				matched = append(matched, r...)
237*46c4c49dSIbrahim Kanouche			}
238*46c4c49dSIbrahim Kanouche			possible = possible[:0]
239*46c4c49dSIbrahim Kanouche			previous = nil
240*46c4c49dSIbrahim Kanouche		}
241*46c4c49dSIbrahim Kanouche		if !ok {
242*46c4c49dSIbrahim Kanouche			// There isn't a match in the source.
243*46c4c49dSIbrahim Kanouche			continue
244*46c4c49dSIbrahim Kanouche		}
245*46c4c49dSIbrahim Kanouche
246*46c4c49dSIbrahim Kanouche		// Maps index within `possible` to the slice of ranges extended by a new range
247*46c4c49dSIbrahim Kanouche		extended := make(map[int]*MatchRanges)
248*46c4c49dSIbrahim Kanouche		// Go over the set of source ranges growing lists of `possible` match ranges.
249*46c4c49dSIbrahim Kanouche		tv := tgtNode.tokens
250*46c4c49dSIbrahim Kanouche		for _, sv := range sr {
251*46c4c49dSIbrahim Kanouche			r := &MatchRange{
252*46c4c49dSIbrahim Kanouche				SrcStart:    sv.Start,
253*46c4c49dSIbrahim Kanouche				SrcEnd:      sv.End,
254*46c4c49dSIbrahim Kanouche				TargetStart: tv.Start,
255*46c4c49dSIbrahim Kanouche				TargetEnd:   tv.End,
256*46c4c49dSIbrahim Kanouche			}
257*46c4c49dSIbrahim Kanouche			found := false
258*46c4c49dSIbrahim Kanouche			// Grow or extend each abutting `possible` match range.
259*46c4c49dSIbrahim Kanouche			for i, p := range possible {
260*46c4c49dSIbrahim Kanouche				last := p[len(p)-1]
261*46c4c49dSIbrahim Kanouche				if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd {
262*46c4c49dSIbrahim Kanouche					found = true
263*46c4c49dSIbrahim Kanouche					possible[i] = append(possible[i], r)
264*46c4c49dSIbrahim Kanouche					extended[i] = &possible[i]
265*46c4c49dSIbrahim Kanouche				}
266*46c4c49dSIbrahim Kanouche			}
267*46c4c49dSIbrahim Kanouche			if !found {
268*46c4c49dSIbrahim Kanouche				// Did not abut any existing ranges, start a new `possible` match range.
269*46c4c49dSIbrahim Kanouche				mrs := make(MatchRanges, 0, 2)
270*46c4c49dSIbrahim Kanouche				mrs = append(mrs, r)
271*46c4c49dSIbrahim Kanouche				possible = append(possible, mrs)
272*46c4c49dSIbrahim Kanouche				extended[len(possible)-1] = &possible[len(possible)-1]
273*46c4c49dSIbrahim Kanouche			}
274*46c4c49dSIbrahim Kanouche		}
275*46c4c49dSIbrahim Kanouche		if len(extended) < len(possible) {
276*46c4c49dSIbrahim Kanouche			// Ranges not extended--add to `matched` if not included in other range.
277*46c4c49dSIbrahim Kanouche			for i := 0; i < len(possible); {
278*46c4c49dSIbrahim Kanouche				_, updated := extended[i]
279*46c4c49dSIbrahim Kanouche				if updated {
280*46c4c49dSIbrahim Kanouche					i++ // Keep in `possible` and advance to next index.
281*46c4c49dSIbrahim Kanouche					continue
282*46c4c49dSIbrahim Kanouche				}
283*46c4c49dSIbrahim Kanouche				p1 := possible[i]
284*46c4c49dSIbrahim Kanouche				found := false // whether found as subrange of another `possible` match.
285*46c4c49dSIbrahim Kanouche				for _, p2 := range extended {
286*46c4c49dSIbrahim Kanouche					if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart {
287*46c4c49dSIbrahim Kanouche						found = true
288*46c4c49dSIbrahim Kanouche						break
289*46c4c49dSIbrahim Kanouche					}
290*46c4c49dSIbrahim Kanouche				}
291*46c4c49dSIbrahim Kanouche				if !found {
292*46c4c49dSIbrahim Kanouche					matched = append(matched, p1...)
293*46c4c49dSIbrahim Kanouche				} // else included in other match.
294*46c4c49dSIbrahim Kanouche				// Finished -- delete from `possible` and continue from same index.
295*46c4c49dSIbrahim Kanouche				possible = append(possible[:i], possible[i+1:]...)
296*46c4c49dSIbrahim Kanouche			}
297*46c4c49dSIbrahim Kanouche		}
298*46c4c49dSIbrahim Kanouche		previous = tgtNode
299*46c4c49dSIbrahim Kanouche	}
300*46c4c49dSIbrahim Kanouche	// At end of file, terminate all `possible` match ranges.
301*46c4c49dSIbrahim Kanouche	for i := 0; i < len(possible); i++ {
302*46c4c49dSIbrahim Kanouche		p1 := possible[i]
303*46c4c49dSIbrahim Kanouche		found := false // whether found as subrange of another `possible` match.
304*46c4c49dSIbrahim Kanouche		for j := i + 1; j < len(possible); {
305*46c4c49dSIbrahim Kanouche			p2 := possible[j]
306*46c4c49dSIbrahim Kanouche			if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart {
307*46c4c49dSIbrahim Kanouche				// Delete later sub-ranges included in this range.
308*46c4c49dSIbrahim Kanouche				possible = append(possible[:j], possible[j+1:]...)
309*46c4c49dSIbrahim Kanouche				continue
310*46c4c49dSIbrahim Kanouche			}
311*46c4c49dSIbrahim Kanouche			// Skip if subrange of a later range
312*46c4c49dSIbrahim Kanouche			if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart {
313*46c4c49dSIbrahim Kanouche				found = true
314*46c4c49dSIbrahim Kanouche			}
315*46c4c49dSIbrahim Kanouche			j++
316*46c4c49dSIbrahim Kanouche		}
317*46c4c49dSIbrahim Kanouche		if !found {
318*46c4c49dSIbrahim Kanouche			matched = append(matched, p1...)
319*46c4c49dSIbrahim Kanouche		}
320*46c4c49dSIbrahim Kanouche	}
321*46c4c49dSIbrahim Kanouche	return matched
322*46c4c49dSIbrahim Kanouche}
323*46c4c49dSIbrahim Kanouche
324*46c4c49dSIbrahim Kanouche// untangleSourceRanges goes through the ranges and removes any whose source
325*46c4c49dSIbrahim Kanouche// ranges are "out of order". A source range is "out of order" if the source
326*46c4c49dSIbrahim Kanouche// range is out of sequence with the source ranges before and after it. This
327*46c4c49dSIbrahim Kanouche// happens when more than one source range maps to the same target range.
328*46c4c49dSIbrahim Kanouche// E.g.:
329*46c4c49dSIbrahim Kanouche//
330*46c4c49dSIbrahim Kanouche//	   SrcStart: 20, SrcEnd: 30, TargetStart: 127, TargetEnd: 137
331*46c4c49dSIbrahim Kanouche//	1: SrcStart: 12, SrcEnd: 17, TargetStart: 138, TargetEnd: 143
332*46c4c49dSIbrahim Kanouche//	2: SrcStart: 32, SrcEnd: 37, TargetStart: 138, TargetEnd: 143
333*46c4c49dSIbrahim Kanouche//	   SrcStart: 38, SrcEnd: 40, TargetStart: 144, TargetEnd: 146
334*46c4c49dSIbrahim Kanouche//
335*46c4c49dSIbrahim Kanouche// Here (1) is out of order, because the source range [12, 17) is out of
336*46c4c49dSIbrahim Kanouche// sequence with the surrounding source sequences, but [32, 37) is.
337*46c4c49dSIbrahim Kanouchefunc untangleSourceRanges(matched MatchRanges) MatchRanges {
338*46c4c49dSIbrahim Kanouche	mr := MatchRanges{matched[0]}
339*46c4c49dSIbrahim KanoucheNEXT:
340*46c4c49dSIbrahim Kanouche	for i := 1; i < len(matched); i++ {
341*46c4c49dSIbrahim Kanouche		if mr[len(mr)-1].TargetStart == matched[i].TargetStart && mr[len(mr)-1].TargetEnd == matched[i].TargetEnd {
342*46c4c49dSIbrahim Kanouche			// The matched range has already been added.
343*46c4c49dSIbrahim Kanouche			continue
344*46c4c49dSIbrahim Kanouche		}
345*46c4c49dSIbrahim Kanouche
346*46c4c49dSIbrahim Kanouche		if i+1 < len(matched) && equalTargetRange(matched[i], matched[i+1]) {
347*46c4c49dSIbrahim Kanouche			// A sequence of ranges match the same target range.
348*46c4c49dSIbrahim Kanouche			// Find the first one that has a source range greater
349*46c4c49dSIbrahim Kanouche			// than the currently matched range. Omit all others.
350*46c4c49dSIbrahim Kanouche			if matched[i].SrcStart > mr[len(mr)-1].SrcStart {
351*46c4c49dSIbrahim Kanouche				mr = append(mr, matched[i])
352*46c4c49dSIbrahim Kanouche				continue
353*46c4c49dSIbrahim Kanouche			}
354*46c4c49dSIbrahim Kanouche
355*46c4c49dSIbrahim Kanouche			for j := i + 1; j < len(matched) && equalTargetRange(matched[i], matched[j]); j++ {
356*46c4c49dSIbrahim Kanouche				// Check subsequent ranges to see if we can
357*46c4c49dSIbrahim Kanouche				// find one that matches in the correct order.
358*46c4c49dSIbrahim Kanouche				if matched[j].SrcStart > mr[len(mr)-1].SrcStart {
359*46c4c49dSIbrahim Kanouche					mr = append(mr, matched[j])
360*46c4c49dSIbrahim Kanouche					i = j
361*46c4c49dSIbrahim Kanouche					continue NEXT
362*46c4c49dSIbrahim Kanouche				}
363*46c4c49dSIbrahim Kanouche			}
364*46c4c49dSIbrahim Kanouche		}
365*46c4c49dSIbrahim Kanouche
366*46c4c49dSIbrahim Kanouche		mr = append(mr, matched[i])
367*46c4c49dSIbrahim Kanouche	}
368*46c4c49dSIbrahim Kanouche	return mr
369*46c4c49dSIbrahim Kanouche}
370*46c4c49dSIbrahim Kanouche
371*46c4c49dSIbrahim Kanouche// equalTargetRange returns true if the two MatchRange's cover the same target range.
372*46c4c49dSIbrahim Kanouchefunc equalTargetRange(this, that *MatchRange) bool {
373*46c4c49dSIbrahim Kanouche	return this.TargetStart == that.TargetStart && this.TargetEnd == that.TargetEnd
374*46c4c49dSIbrahim Kanouche}
375*46c4c49dSIbrahim Kanouche
376*46c4c49dSIbrahim Kanouche// splitRanges splits the matched ranges so that a single match range has a
377*46c4c49dSIbrahim Kanouche// monotonically increasing source range (indicating a single, potential
378*46c4c49dSIbrahim Kanouche// instance of the source in the target).
379*46c4c49dSIbrahim Kanouchefunc splitRanges(matched MatchRanges) []MatchRanges {
380*46c4c49dSIbrahim Kanouche	var matchedRanges []MatchRanges
381*46c4c49dSIbrahim Kanouche	mr := MatchRanges{matched[0]}
382*46c4c49dSIbrahim Kanouche	for i := 1; i < len(matched); i++ {
383*46c4c49dSIbrahim Kanouche		if mr[len(mr)-1].SrcStart > matched[i].SrcStart {
384*46c4c49dSIbrahim Kanouche			matchedRanges = append(matchedRanges, mr)
385*46c4c49dSIbrahim Kanouche			mr = MatchRanges{matched[i]}
386*46c4c49dSIbrahim Kanouche		} else {
387*46c4c49dSIbrahim Kanouche			mr = append(mr, matched[i])
388*46c4c49dSIbrahim Kanouche		}
389*46c4c49dSIbrahim Kanouche	}
390*46c4c49dSIbrahim Kanouche	matchedRanges = append(matchedRanges, mr)
391*46c4c49dSIbrahim Kanouche	return matchedRanges
392*46c4c49dSIbrahim Kanouche}
393*46c4c49dSIbrahim Kanouche
394*46c4c49dSIbrahim Kanouche// mergeConsecutiveRanges goes through the matched ranges and merges
395*46c4c49dSIbrahim Kanouche// consecutive ranges. Two ranges are consecutive if the end of the previous
396*46c4c49dSIbrahim Kanouche// matched range and beginning of the next matched range overlap. "matched"
397*46c4c49dSIbrahim Kanouche// should have 1 or more MatchRanges, each with one or more MatchRange objects.
398*46c4c49dSIbrahim Kanouchefunc mergeConsecutiveRanges(matched []MatchRanges) []MatchRanges {
399*46c4c49dSIbrahim Kanouche	mr := []MatchRanges{matched[0]}
400*46c4c49dSIbrahim Kanouche
401*46c4c49dSIbrahim Kanouche	// Convenience functions.
402*46c4c49dSIbrahim Kanouche	prevMatchedRange := func() MatchRanges {
403*46c4c49dSIbrahim Kanouche		return mr[len(mr)-1]
404*46c4c49dSIbrahim Kanouche	}
405*46c4c49dSIbrahim Kanouche	prevMatchedRangeLastElem := func() *MatchRange {
406*46c4c49dSIbrahim Kanouche		return prevMatchedRange()[len(prevMatchedRange())-1]
407*46c4c49dSIbrahim Kanouche	}
408*46c4c49dSIbrahim Kanouche
409*46c4c49dSIbrahim Kanouche	// This algorithm compares the start of each MatchRanges object to the
410*46c4c49dSIbrahim Kanouche	// end of the previous MatchRanges object. If they overlap, then it
411*46c4c49dSIbrahim Kanouche	// tries to combine them. Note that a 0 offset into a MatchRanges
412*46c4c49dSIbrahim Kanouche	// object (e.g., matched[i][0]) is its first MatchRange, which
413*46c4c49dSIbrahim Kanouche	// indicates the start of the whole matched range.
414*46c4c49dSIbrahim KanoucheNEXT:
415*46c4c49dSIbrahim Kanouche	for i := 1; i < len(matched); i++ {
416*46c4c49dSIbrahim Kanouche		if prevMatchedRangeLastElem().TargetEnd > matched[i][0].TargetStart {
417*46c4c49dSIbrahim Kanouche			// Consecutive matched ranges overlap. Merge them.
418*46c4c49dSIbrahim Kanouche			if prevMatchedRangeLastElem().TargetStart < matched[i][0].TargetStart {
419*46c4c49dSIbrahim Kanouche				// The last element of the previous matched
420*46c4c49dSIbrahim Kanouche				// range overlaps with the first element of the
421*46c4c49dSIbrahim Kanouche				// current matched range. Concatenate them.
422*46c4c49dSIbrahim Kanouche				if prevMatchedRangeLastElem().TargetEnd < matched[i][0].TargetEnd {
423*46c4c49dSIbrahim Kanouche					prevMatchedRangeLastElem().SrcEnd += matched[i][0].TargetEnd - prevMatchedRangeLastElem().TargetEnd
424*46c4c49dSIbrahim Kanouche					prevMatchedRangeLastElem().TargetEnd = matched[i][0].TargetEnd
425*46c4c49dSIbrahim Kanouche				}
426*46c4c49dSIbrahim Kanouche				mr[len(mr)-1] = append(prevMatchedRange(), matched[i][1:]...)
427*46c4c49dSIbrahim Kanouche				continue
428*46c4c49dSIbrahim Kanouche			}
429*46c4c49dSIbrahim Kanouche
430*46c4c49dSIbrahim Kanouche			for j := 1; j < len(matched[i]); j++ {
431*46c4c49dSIbrahim Kanouche				// Find the positions in the ranges where the
432*46c4c49dSIbrahim Kanouche				// tail end of the previous matched range
433*46c4c49dSIbrahim Kanouche				// overlaps with the start of the next matched
434*46c4c49dSIbrahim Kanouche				// range.
435*46c4c49dSIbrahim Kanouche				for k := len(prevMatchedRange()) - 1; k > 0; k-- {
436*46c4c49dSIbrahim Kanouche					if prevMatchedRange()[k].SrcStart < matched[i][j].SrcStart &&
437*46c4c49dSIbrahim Kanouche						prevMatchedRange()[k].TargetStart < matched[i][j].TargetStart {
438*46c4c49dSIbrahim Kanouche						// Append the next range to the previous range.
439*46c4c49dSIbrahim Kanouche						if prevMatchedRange()[k].TargetEnd < matched[i][j].TargetStart {
440*46c4c49dSIbrahim Kanouche							// Coalesce the ranges.
441*46c4c49dSIbrahim Kanouche							prevMatchedRange()[k].SrcEnd += matched[i][j-1].TargetEnd - prevMatchedRange()[k].TargetEnd
442*46c4c49dSIbrahim Kanouche							prevMatchedRange()[k].TargetEnd = matched[i][j-1].TargetEnd
443*46c4c49dSIbrahim Kanouche						}
444*46c4c49dSIbrahim Kanouche						mr[len(mr)-1] = append(prevMatchedRange()[:k+1], matched[i][j:]...)
445*46c4c49dSIbrahim Kanouche						continue NEXT
446*46c4c49dSIbrahim Kanouche					}
447*46c4c49dSIbrahim Kanouche				}
448*46c4c49dSIbrahim Kanouche			}
449*46c4c49dSIbrahim Kanouche		}
450*46c4c49dSIbrahim Kanouche		mr = append(mr, matched[i])
451*46c4c49dSIbrahim Kanouche	}
452*46c4c49dSIbrahim Kanouche	return mr
453*46c4c49dSIbrahim Kanouche}
454*46c4c49dSIbrahim Kanouche
455*46c4c49dSIbrahim Kanouche// coalesceMatchRanges coalesces overlapping match ranges into a single
456*46c4c49dSIbrahim Kanouche// contiguous match range.
457*46c4c49dSIbrahim Kanouchefunc coalesceMatchRanges(matchedRanges MatchRanges) MatchRanges {
458*46c4c49dSIbrahim Kanouche	coalesced := MatchRanges{matchedRanges[0]}
459*46c4c49dSIbrahim Kanouche	for i := 1; i < len(matchedRanges); i++ {
460*46c4c49dSIbrahim Kanouche		c := coalesced[len(coalesced)-1]
461*46c4c49dSIbrahim Kanouche		mr := matchedRanges[i]
462*46c4c49dSIbrahim Kanouche
463*46c4c49dSIbrahim Kanouche		if mr.SrcStart <= c.SrcEnd && mr.SrcStart >= c.SrcStart {
464*46c4c49dSIbrahim Kanouche			var se, ts, te int
465*46c4c49dSIbrahim Kanouche			if mr.SrcEnd > c.SrcEnd {
466*46c4c49dSIbrahim Kanouche				se = mr.SrcEnd
467*46c4c49dSIbrahim Kanouche			} else {
468*46c4c49dSIbrahim Kanouche				se = c.SrcEnd
469*46c4c49dSIbrahim Kanouche			}
470*46c4c49dSIbrahim Kanouche			if mr.TargetStart < c.TargetStart {
471*46c4c49dSIbrahim Kanouche				ts = mr.TargetStart
472*46c4c49dSIbrahim Kanouche			} else {
473*46c4c49dSIbrahim Kanouche				ts = c.TargetStart
474*46c4c49dSIbrahim Kanouche			}
475*46c4c49dSIbrahim Kanouche			if mr.TargetEnd > c.TargetEnd {
476*46c4c49dSIbrahim Kanouche				te = mr.TargetEnd
477*46c4c49dSIbrahim Kanouche			} else {
478*46c4c49dSIbrahim Kanouche				te = c.TargetEnd
479*46c4c49dSIbrahim Kanouche			}
480*46c4c49dSIbrahim Kanouche			coalesced[len(coalesced)-1] = &MatchRange{
481*46c4c49dSIbrahim Kanouche				SrcStart:    c.SrcStart,
482*46c4c49dSIbrahim Kanouche				SrcEnd:      se,
483*46c4c49dSIbrahim Kanouche				TargetStart: ts,
484*46c4c49dSIbrahim Kanouche				TargetEnd:   te,
485*46c4c49dSIbrahim Kanouche			}
486*46c4c49dSIbrahim Kanouche		} else {
487*46c4c49dSIbrahim Kanouche			coalesced = append(coalesced, mr)
488*46c4c49dSIbrahim Kanouche		}
489*46c4c49dSIbrahim Kanouche	}
490*46c4c49dSIbrahim Kanouche	return coalesced
491*46c4c49dSIbrahim Kanouche}
492