xref: /aosp_15_r20/external/licenseclassifier/stringclassifier/searchset/searchset.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1// Copyright 2017 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package searchset generates hashes for all substrings of a text. Potential
16// matches between two SearchSet objects can then be determined quickly.
17// Generating the hashes can be expensive, so it's best to perform it once. If
18// the text is part of a known corpus, then the SearchSet can be serialized and
19// kept in an archive.
20//
21// Matching occurs by "mapping" ranges from the source text into the target
22// text but still retaining the source order:
23//
24//	SOURCE: |-----------------------------|
25//
26//	TARGET: |*****************************************|
27//
28//	MAP SOURCE SECTIONS ONTO TARGET IN SOURCE ORDER:
29//
30//	  S:  |-[--]-----[---]------[----]------|
31//	         /         |           \
32//	      |---|   |---------|   |-------------|
33//	  T: |*****************************************|
34//
35// Note that a single source range may match many different ranges in the
36// target. The matching algorithm untangles these so that all matched ranges
37// are in order with respect to the source ranges. This is especially important
38// since the source text may occur more than once in the target text. The
39// algorithm finds each potential occurrence of S in T and returns all as
40// potential matched ranges.
41package searchset
42
43import (
44	"encoding/gob"
45	"fmt"
46	"io"
47	"sort"
48
49	"github.com/google/licenseclassifier/stringclassifier/searchset/tokenizer"
50)
51
52// DefaultGranularity is the minimum size (in words) of the hash chunks.
53const DefaultGranularity = 3
54
55// SearchSet is a set of substrings that have hashes associated with them,
56// making it fast to search for potential matches.
57type SearchSet struct {
58	// Tokens is a tokenized list of the original input string.
59	Tokens tokenizer.Tokens
60	// Hashes is a map of checksums to a range of tokens.
61	Hashes tokenizer.Hash
62	// Checksums is a list of checksums ordered from longest range to
63	// shortest.
64	Checksums []uint32
65	// ChecksumRanges are the token ranges for the above checksums.
66	ChecksumRanges tokenizer.TokenRanges
67
68	nodes []*node
69}
70
71// node consists of a range of tokens along with the checksum for those tokens.
72type node struct {
73	checksum uint32
74	tokens   *tokenizer.TokenRange
75}
76
77func (n *node) String() string {
78	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
79}
80
81// New creates a new SearchSet object. It generates a hash for each substring of "s".
82func New(s string, granularity int) *SearchSet {
83	toks := tokenizer.Tokenize(s)
84
85	// Start generating hash values for all substrings within the text.
86	h := make(tokenizer.Hash)
87	checksums, tokenRanges := toks.GenerateHashes(h, func(a, b int) int {
88		if a < b {
89			return a
90		}
91		return b
92	}(len(toks), granularity))
93	sset := &SearchSet{
94		Tokens:         toks,
95		Hashes:         h,
96		Checksums:      checksums,
97		ChecksumRanges: tokenRanges,
98	}
99	sset.GenerateNodeList()
100	return sset
101}
102
103// GenerateNodeList creates a node list out of the search set.
104func (s *SearchSet) GenerateNodeList() {
105	if len(s.Tokens) == 0 {
106		return
107	}
108
109	for i := 0; i < len(s.Checksums); i++ {
110		s.nodes = append(s.nodes, &node{
111			checksum: s.Checksums[i],
112			tokens:   s.ChecksumRanges[i],
113		})
114	}
115}
116
117// Serialize emits the SearchSet out so that it can be recreated at a later
118// time.
119func (s *SearchSet) Serialize(w io.Writer) error {
120	return gob.NewEncoder(w).Encode(s)
121}
122
123// Deserialize reads a file with a serialized SearchSet in it and reconstructs it.
124func Deserialize(r io.Reader, s *SearchSet) error {
125	if err := gob.NewDecoder(r).Decode(&s); err != nil {
126		return err
127	}
128	s.GenerateNodeList()
129	return nil
130}
131
132// MatchRange is the range within the source text that is a match to the range
133// in the target text.
134type MatchRange struct {
135	// Offsets into the source tokens.
136	SrcStart, SrcEnd int
137	// Offsets into the target tokens.
138	TargetStart, TargetEnd int
139}
140
141// in returns true if the start and end are enclosed in the match range.
142func (m *MatchRange) in(start, end int) bool {
143	return start >= m.TargetStart && end <= m.TargetEnd
144}
145
146func (m *MatchRange) String() string {
147	return fmt.Sprintf("[%v, %v)->[%v, %v)", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd)
148}
149
150// MatchRanges is a list of "MatchRange"s. The ranges are monotonically
151// increasing in value and indicate a single potential occurrence of the source
152// text in the target text.
153type MatchRanges []*MatchRange
154
155func (m MatchRanges) Len() int      { return len(m) }
156func (m MatchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
157func (m MatchRanges) Less(i, j int) bool {
158	if m[i].TargetStart < m[j].TargetStart {
159		return true
160	}
161	return m[i].TargetStart == m[j].TargetStart && m[i].SrcStart < m[j].SrcStart
162}
163
164// TargetRange is the start and stop token offsets into the target text.
165func (m MatchRanges) TargetRange(target *SearchSet) (start, end int) {
166	start = target.Tokens[m[0].TargetStart].Offset
167	end = target.Tokens[m[len(m)-1].TargetEnd-1].Offset + len(target.Tokens[m[len(m)-1].TargetEnd-1].Text)
168	return start, end
169}
170
171// Size is the number of source tokens that were matched.
172func (m MatchRanges) Size() int {
173	sum := 0
174	for _, mr := range m {
175		sum += mr.SrcEnd - mr.SrcStart
176	}
177	return sum
178}
179
180// FindPotentialMatches returns the ranges in the target (unknown) text that
181// are best potential matches to the source (known) text.
182func FindPotentialMatches(src, target *SearchSet) []MatchRanges {
183	matchedRanges := getMatchedRanges(src, target)
184	if len(matchedRanges) == 0 {
185		return nil
186	}
187
188	// Cleanup the matching ranges so that we get the longest contiguous ranges.
189	for i := 0; i < len(matchedRanges); i++ {
190		matchedRanges[i] = coalesceMatchRanges(matchedRanges[i])
191	}
192	return matchedRanges
193}
194
195// getMatchedRanges finds the ranges in the target text that match the source
196// text. There can be multiple occurrences of the source text within the target
197// text. Each separate occurrence is an entry in the returned slice.
198func getMatchedRanges(src, target *SearchSet) []MatchRanges {
199	matched := targetMatchedRanges(src, target)
200	if len(matched) == 0 {
201		return nil
202	}
203	sort.Sort(matched)
204	matched = untangleSourceRanges(matched)
205	matchedRanges := splitRanges(matched)
206	return mergeConsecutiveRanges(matchedRanges)
207}
208
209func extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool {
210	if len(mr) == 0 {
211		return false
212	}
213	for _, tv := range tr {
214		for _, mv := range mr {
215			if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd {
216				return true
217			}
218		}
219	}
220	return false
221}
222
223// targetMatchedRanges finds matching sequences in target and src ordered by target position
224func targetMatchedRanges(src, target *SearchSet) MatchRanges {
225	if src.nodes == nil {
226		return nil
227	}
228
229	var matched MatchRanges
230	var previous *node
231	var possible []MatchRanges
232	for _, tgtNode := range target.nodes {
233		sr, ok := src.Hashes[tgtNode.checksum]
234		if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) {
235			for _, r := range possible {
236				matched = append(matched, r...)
237			}
238			possible = possible[:0]
239			previous = nil
240		}
241		if !ok {
242			// There isn't a match in the source.
243			continue
244		}
245
246		// Maps index within `possible` to the slice of ranges extended by a new range
247		extended := make(map[int]*MatchRanges)
248		// Go over the set of source ranges growing lists of `possible` match ranges.
249		tv := tgtNode.tokens
250		for _, sv := range sr {
251			r := &MatchRange{
252				SrcStart:    sv.Start,
253				SrcEnd:      sv.End,
254				TargetStart: tv.Start,
255				TargetEnd:   tv.End,
256			}
257			found := false
258			// Grow or extend each abutting `possible` match range.
259			for i, p := range possible {
260				last := p[len(p)-1]
261				if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd {
262					found = true
263					possible[i] = append(possible[i], r)
264					extended[i] = &possible[i]
265				}
266			}
267			if !found {
268				// Did not abut any existing ranges, start a new `possible` match range.
269				mrs := make(MatchRanges, 0, 2)
270				mrs = append(mrs, r)
271				possible = append(possible, mrs)
272				extended[len(possible)-1] = &possible[len(possible)-1]
273			}
274		}
275		if len(extended) < len(possible) {
276			// Ranges not extended--add to `matched` if not included in other range.
277			for i := 0; i < len(possible); {
278				_, updated := extended[i]
279				if updated {
280					i++ // Keep in `possible` and advance to next index.
281					continue
282				}
283				p1 := possible[i]
284				found := false // whether found as subrange of another `possible` match.
285				for _, p2 := range extended {
286					if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart {
287						found = true
288						break
289					}
290				}
291				if !found {
292					matched = append(matched, p1...)
293				} // else included in other match.
294				// Finished -- delete from `possible` and continue from same index.
295				possible = append(possible[:i], possible[i+1:]...)
296			}
297		}
298		previous = tgtNode
299	}
300	// At end of file, terminate all `possible` match ranges.
301	for i := 0; i < len(possible); i++ {
302		p1 := possible[i]
303		found := false // whether found as subrange of another `possible` match.
304		for j := i + 1; j < len(possible); {
305			p2 := possible[j]
306			if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart {
307				// Delete later sub-ranges included in this range.
308				possible = append(possible[:j], possible[j+1:]...)
309				continue
310			}
311			// Skip if subrange of a later range
312			if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart {
313				found = true
314			}
315			j++
316		}
317		if !found {
318			matched = append(matched, p1...)
319		}
320	}
321	return matched
322}
323
324// untangleSourceRanges goes through the ranges and removes any whose source
325// ranges are "out of order". A source range is "out of order" if the source
326// range is out of sequence with the source ranges before and after it. This
327// happens when more than one source range maps to the same target range.
328// E.g.:
329//
330//	   SrcStart: 20, SrcEnd: 30, TargetStart: 127, TargetEnd: 137
331//	1: SrcStart: 12, SrcEnd: 17, TargetStart: 138, TargetEnd: 143
332//	2: SrcStart: 32, SrcEnd: 37, TargetStart: 138, TargetEnd: 143
333//	   SrcStart: 38, SrcEnd: 40, TargetStart: 144, TargetEnd: 146
334//
335// Here (1) is out of order, because the source range [12, 17) is out of
336// sequence with the surrounding source sequences, but [32, 37) is.
337func untangleSourceRanges(matched MatchRanges) MatchRanges {
338	mr := MatchRanges{matched[0]}
339NEXT:
340	for i := 1; i < len(matched); i++ {
341		if mr[len(mr)-1].TargetStart == matched[i].TargetStart && mr[len(mr)-1].TargetEnd == matched[i].TargetEnd {
342			// The matched range has already been added.
343			continue
344		}
345
346		if i+1 < len(matched) && equalTargetRange(matched[i], matched[i+1]) {
347			// A sequence of ranges match the same target range.
348			// Find the first one that has a source range greater
349			// than the currently matched range. Omit all others.
350			if matched[i].SrcStart > mr[len(mr)-1].SrcStart {
351				mr = append(mr, matched[i])
352				continue
353			}
354
355			for j := i + 1; j < len(matched) && equalTargetRange(matched[i], matched[j]); j++ {
356				// Check subsequent ranges to see if we can
357				// find one that matches in the correct order.
358				if matched[j].SrcStart > mr[len(mr)-1].SrcStart {
359					mr = append(mr, matched[j])
360					i = j
361					continue NEXT
362				}
363			}
364		}
365
366		mr = append(mr, matched[i])
367	}
368	return mr
369}
370
371// equalTargetRange returns true if the two MatchRange's cover the same target range.
372func equalTargetRange(this, that *MatchRange) bool {
373	return this.TargetStart == that.TargetStart && this.TargetEnd == that.TargetEnd
374}
375
376// splitRanges splits the matched ranges so that a single match range has a
377// monotonically increasing source range (indicating a single, potential
378// instance of the source in the target).
379func splitRanges(matched MatchRanges) []MatchRanges {
380	var matchedRanges []MatchRanges
381	mr := MatchRanges{matched[0]}
382	for i := 1; i < len(matched); i++ {
383		if mr[len(mr)-1].SrcStart > matched[i].SrcStart {
384			matchedRanges = append(matchedRanges, mr)
385			mr = MatchRanges{matched[i]}
386		} else {
387			mr = append(mr, matched[i])
388		}
389	}
390	matchedRanges = append(matchedRanges, mr)
391	return matchedRanges
392}
393
394// mergeConsecutiveRanges goes through the matched ranges and merges
395// consecutive ranges. Two ranges are consecutive if the end of the previous
396// matched range and beginning of the next matched range overlap. "matched"
397// should have 1 or more MatchRanges, each with one or more MatchRange objects.
398func mergeConsecutiveRanges(matched []MatchRanges) []MatchRanges {
399	mr := []MatchRanges{matched[0]}
400
401	// Convenience functions.
402	prevMatchedRange := func() MatchRanges {
403		return mr[len(mr)-1]
404	}
405	prevMatchedRangeLastElem := func() *MatchRange {
406		return prevMatchedRange()[len(prevMatchedRange())-1]
407	}
408
409	// This algorithm compares the start of each MatchRanges object to the
410	// end of the previous MatchRanges object. If they overlap, then it
411	// tries to combine them. Note that a 0 offset into a MatchRanges
412	// object (e.g., matched[i][0]) is its first MatchRange, which
413	// indicates the start of the whole matched range.
414NEXT:
415	for i := 1; i < len(matched); i++ {
416		if prevMatchedRangeLastElem().TargetEnd > matched[i][0].TargetStart {
417			// Consecutive matched ranges overlap. Merge them.
418			if prevMatchedRangeLastElem().TargetStart < matched[i][0].TargetStart {
419				// The last element of the previous matched
420				// range overlaps with the first element of the
421				// current matched range. Concatenate them.
422				if prevMatchedRangeLastElem().TargetEnd < matched[i][0].TargetEnd {
423					prevMatchedRangeLastElem().SrcEnd += matched[i][0].TargetEnd - prevMatchedRangeLastElem().TargetEnd
424					prevMatchedRangeLastElem().TargetEnd = matched[i][0].TargetEnd
425				}
426				mr[len(mr)-1] = append(prevMatchedRange(), matched[i][1:]...)
427				continue
428			}
429
430			for j := 1; j < len(matched[i]); j++ {
431				// Find the positions in the ranges where the
432				// tail end of the previous matched range
433				// overlaps with the start of the next matched
434				// range.
435				for k := len(prevMatchedRange()) - 1; k > 0; k-- {
436					if prevMatchedRange()[k].SrcStart < matched[i][j].SrcStart &&
437						prevMatchedRange()[k].TargetStart < matched[i][j].TargetStart {
438						// Append the next range to the previous range.
439						if prevMatchedRange()[k].TargetEnd < matched[i][j].TargetStart {
440							// Coalesce the ranges.
441							prevMatchedRange()[k].SrcEnd += matched[i][j-1].TargetEnd - prevMatchedRange()[k].TargetEnd
442							prevMatchedRange()[k].TargetEnd = matched[i][j-1].TargetEnd
443						}
444						mr[len(mr)-1] = append(prevMatchedRange()[:k+1], matched[i][j:]...)
445						continue NEXT
446					}
447				}
448			}
449		}
450		mr = append(mr, matched[i])
451	}
452	return mr
453}
454
455// coalesceMatchRanges coalesces overlapping match ranges into a single
456// contiguous match range.
457func coalesceMatchRanges(matchedRanges MatchRanges) MatchRanges {
458	coalesced := MatchRanges{matchedRanges[0]}
459	for i := 1; i < len(matchedRanges); i++ {
460		c := coalesced[len(coalesced)-1]
461		mr := matchedRanges[i]
462
463		if mr.SrcStart <= c.SrcEnd && mr.SrcStart >= c.SrcStart {
464			var se, ts, te int
465			if mr.SrcEnd > c.SrcEnd {
466				se = mr.SrcEnd
467			} else {
468				se = c.SrcEnd
469			}
470			if mr.TargetStart < c.TargetStart {
471				ts = mr.TargetStart
472			} else {
473				ts = c.TargetStart
474			}
475			if mr.TargetEnd > c.TargetEnd {
476				te = mr.TargetEnd
477			} else {
478				te = c.TargetEnd
479			}
480			coalesced[len(coalesced)-1] = &MatchRange{
481				SrcStart:    c.SrcStart,
482				SrcEnd:      se,
483				TargetStart: ts,
484				TargetEnd:   te,
485			}
486		} else {
487			coalesced = append(coalesced, mr)
488		}
489	}
490	return coalesced
491}
492