stringclassifier/searchset/searchset.go

// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*46c4c49dSIbrahim Kanouche
// Package searchset generates hashes for all substrings of a text. Potential
// matches between two SearchSet objects can then be determined quickly.
// Generating the hashes can be expensive, so it's best to perform it once. If
// the text is part of a known corpus, then the SearchSet can be serialized and
// kept in an archive.
//
// Matching occurs by "mapping" ranges from the source text into the target
// text but still retaining the source order:
//
//	SOURCE: |-----------------------------|
//
//	TARGET: |*****************************************|
//
//	MAP SOURCE SECTIONS ONTO TARGET IN SOURCE ORDER:
//
//	  S:  |-[--]-----[---]------[----]------|
//	         /         |           \
//	      |---|   |---------|   |-------------|
//	  T: |*****************************************|
//
// Note that a single source range may match many different ranges in the
// target. The matching algorithm untangles these so that all matched ranges
// are in order with respect to the source ranges. This is especially important
// since the source text may occur more than once in the target text. The
// algorithm finds each potential occurrence of S in T and returns all as
// potential matched ranges.
*46c4c49dSIbrahim Kanouchepackage searchset
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanoucheimport (
*46c4c49dSIbrahim Kanouche	"encoding/gob"
*46c4c49dSIbrahim Kanouche	"fmt"
*46c4c49dSIbrahim Kanouche	"io"
*46c4c49dSIbrahim Kanouche	"sort"
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier/searchset/tokenizer"
*46c4c49dSIbrahim Kanouche)
*46c4c49dSIbrahim Kanouche
// DefaultGranularity is the minimum size (in words) of the hash chunks.
// NOTE(review): nothing in this file reads it; presumably callers pass it as
// the granularity argument of New -- confirm against the call sites.
const DefaultGranularity = 3
*46c4c49dSIbrahim Kanouche
// SearchSet is a set of substrings that have hashes associated with them,
// making it fast to search for potential matches. The exported fields are the
// state written by Serialize and restored by Deserialize.
type SearchSet struct {
	// Tokens is a tokenized list of the original input string.
	Tokens tokenizer.Tokens
	// Hashes maps a checksum to the token ranges that hash to it.
	Hashes tokenizer.Hash
	// Checksums is a list of checksums ordered from longest range to
	// shortest.
	Checksums []uint32
	// ChecksumRanges are the token ranges for the above checksums:
	// ChecksumRanges[i] is paired with Checksums[i] by GenerateNodeList.
	ChecksumRanges tokenizer.TokenRanges

	// nodes is the (checksum, token range) list built by GenerateNodeList.
	// It is unexported, so encoding/gob does not serialize it; Deserialize
	// rebuilds it after decoding.
	nodes []*node
}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// node consists of a range of tokens along with the checksum for those tokens.
*46c4c49dSIbrahim Kanouchetype node struct {
*46c4c49dSIbrahim Kanouche	checksum uint32
*46c4c49dSIbrahim Kanouche	tokens   *tokenizer.TokenRange
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouchefunc (n *node) String() string {
*46c4c49dSIbrahim Kanouche	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// New creates a new SearchSet object. It generates a hash for each substring of "s".
*46c4c49dSIbrahim Kanouchefunc New(s string, granularity int) *SearchSet {
*46c4c49dSIbrahim Kanouche	toks := tokenizer.Tokenize(s)
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	// Start generating hash values for all substrings within the text.
*46c4c49dSIbrahim Kanouche	h := make(tokenizer.Hash)
*46c4c49dSIbrahim Kanouche	checksums, tokenRanges := toks.GenerateHashes(h, func(a, b int) int {
*46c4c49dSIbrahim Kanouche		if a < b {
*46c4c49dSIbrahim Kanouche			return a
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		return b
*46c4c49dSIbrahim Kanouche	}(len(toks), granularity))
*46c4c49dSIbrahim Kanouche	sset := &SearchSet{
*46c4c49dSIbrahim Kanouche		Tokens:         toks,
*46c4c49dSIbrahim Kanouche		Hashes:         h,
*46c4c49dSIbrahim Kanouche		Checksums:      checksums,
*46c4c49dSIbrahim Kanouche		ChecksumRanges: tokenRanges,
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	sset.GenerateNodeList()
*46c4c49dSIbrahim Kanouche	return sset
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// GenerateNodeList creates a node list out of the search set.
*46c4c49dSIbrahim Kanouchefunc (s *SearchSet) GenerateNodeList() {
*46c4c49dSIbrahim Kanouche	if len(s.Tokens) == 0 {
*46c4c49dSIbrahim Kanouche		return
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	for i := 0; i < len(s.Checksums); i++ {
*46c4c49dSIbrahim Kanouche		s.nodes = append(s.nodes, &node{
*46c4c49dSIbrahim Kanouche			checksum: s.Checksums[i],
*46c4c49dSIbrahim Kanouche			tokens:   s.ChecksumRanges[i],
*46c4c49dSIbrahim Kanouche		})
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// Serialize emits the SearchSet out so that it can be recreated at a later
*46c4c49dSIbrahim Kanouche// time.
*46c4c49dSIbrahim Kanouchefunc (s *SearchSet) Serialize(w io.Writer) error {
*46c4c49dSIbrahim Kanouche	return gob.NewEncoder(w).Encode(s)
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// Deserialize reads a file with a serialized SearchSet in it and reconstructs it.
*46c4c49dSIbrahim Kanouchefunc Deserialize(r io.Reader, s *SearchSet) error {
*46c4c49dSIbrahim Kanouche	if err := gob.NewDecoder(r).Decode(&s); err != nil {
*46c4c49dSIbrahim Kanouche		return err
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	s.GenerateNodeList()
*46c4c49dSIbrahim Kanouche	return nil
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
// MatchRange ties a range of source tokens to the range of target tokens it
// matches.
type MatchRange struct {
	// Offsets into the source tokens.
	SrcStart, SrcEnd int
	// Offsets into the target tokens.
	TargetStart, TargetEnd int
}

// in reports whether [start, end] lies entirely inside the target span of the
// match range, i.e. start is not before TargetStart and end is not past
// TargetEnd.
func (m *MatchRange) in(start, end int) bool {
	if start < m.TargetStart {
		return false
	}
	return end <= m.TargetEnd
}

// String formats the match range as "[srcStart, srcEnd)->[targetStart, targetEnd)".
func (m *MatchRange) String() string {
	return fmt.Sprintf(
		"[%v, %v)->[%v, %v)",
		m.SrcStart, m.SrcEnd,
		m.TargetStart, m.TargetEnd,
	)
}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// MatchRanges is a list of "MatchRange"s. The ranges are monotonically
*46c4c49dSIbrahim Kanouche// increasing in value and indicate a single potential occurrence of the source
*46c4c49dSIbrahim Kanouche// text in the target text.
*46c4c49dSIbrahim Kanouchetype MatchRanges []*MatchRange
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Len() int      { return len(m) }
*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Less(i, j int) bool {
*46c4c49dSIbrahim Kanouche	if m[i].TargetStart < m[j].TargetStart {
*46c4c49dSIbrahim Kanouche		return true
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return m[i].TargetStart == m[j].TargetStart && m[i].SrcStart < m[j].SrcStart
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// TargetRange is the start and stop token offsets into the target text.
*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) TargetRange(target *SearchSet) (start, end int) {
*46c4c49dSIbrahim Kanouche	start = target.Tokens[m[0].TargetStart].Offset
*46c4c49dSIbrahim Kanouche	end = target.Tokens[m[len(m)-1].TargetEnd-1].Offset + len(target.Tokens[m[len(m)-1].TargetEnd-1].Text)
*46c4c49dSIbrahim Kanouche	return start, end
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// Size is the number of source tokens that were matched.
*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Size() int {
*46c4c49dSIbrahim Kanouche	sum := 0
*46c4c49dSIbrahim Kanouche	for _, mr := range m {
*46c4c49dSIbrahim Kanouche		sum += mr.SrcEnd - mr.SrcStart
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return sum
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// FindPotentialMatches returns the ranges in the target (unknown) text that
*46c4c49dSIbrahim Kanouche// are best potential matches to the source (known) text.
*46c4c49dSIbrahim Kanouchefunc FindPotentialMatches(src, target *SearchSet) []MatchRanges {
*46c4c49dSIbrahim Kanouche	matchedRanges := getMatchedRanges(src, target)
*46c4c49dSIbrahim Kanouche	if len(matchedRanges) == 0 {
*46c4c49dSIbrahim Kanouche		return nil
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	// Cleanup the matching ranges so that we get the longest contiguous ranges.
*46c4c49dSIbrahim Kanouche	for i := 0; i < len(matchedRanges); i++ {
*46c4c49dSIbrahim Kanouche		matchedRanges[i] = coalesceMatchRanges(matchedRanges[i])
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return matchedRanges
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// getMatchedRanges finds the ranges in the target text that match the source
*46c4c49dSIbrahim Kanouche// text. There can be multiple occurrences of the source text within the target
*46c4c49dSIbrahim Kanouche// text. Each separate occurrence is an entry in the returned slice.
*46c4c49dSIbrahim Kanouchefunc getMatchedRanges(src, target *SearchSet) []MatchRanges {
*46c4c49dSIbrahim Kanouche	matched := targetMatchedRanges(src, target)
*46c4c49dSIbrahim Kanouche	if len(matched) == 0 {
*46c4c49dSIbrahim Kanouche		return nil
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	sort.Sort(matched)
*46c4c49dSIbrahim Kanouche	matched = untangleSourceRanges(matched)
*46c4c49dSIbrahim Kanouche	matchedRanges := splitRanges(matched)
*46c4c49dSIbrahim Kanouche	return mergeConsecutiveRanges(matchedRanges)
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouchefunc extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool {
*46c4c49dSIbrahim Kanouche	if len(mr) == 0 {
*46c4c49dSIbrahim Kanouche		return false
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	for _, tv := range tr {
*46c4c49dSIbrahim Kanouche		for _, mv := range mr {
*46c4c49dSIbrahim Kanouche			if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd {
*46c4c49dSIbrahim Kanouche				return true
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return false
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// targetMatchedRanges finds matching sequences in target and src ordered by target position
*46c4c49dSIbrahim Kanouchefunc targetMatchedRanges(src, target *SearchSet) MatchRanges {
*46c4c49dSIbrahim Kanouche	if src.nodes == nil {
*46c4c49dSIbrahim Kanouche		return nil
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	var matched MatchRanges
*46c4c49dSIbrahim Kanouche	var previous *node
*46c4c49dSIbrahim Kanouche	var possible []MatchRanges
*46c4c49dSIbrahim Kanouche	for _, tgtNode := range target.nodes {
*46c4c49dSIbrahim Kanouche		sr, ok := src.Hashes[tgtNode.checksum]
*46c4c49dSIbrahim Kanouche		if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) {
*46c4c49dSIbrahim Kanouche			for _, r := range possible {
*46c4c49dSIbrahim Kanouche				matched = append(matched, r...)
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			possible = possible[:0]
*46c4c49dSIbrahim Kanouche			previous = nil
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		if !ok {
*46c4c49dSIbrahim Kanouche			// There isn't a match in the source.
*46c4c49dSIbrahim Kanouche			continue
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche		// Maps index within `possible` to the slice of ranges extended by a new range
*46c4c49dSIbrahim Kanouche		extended := make(map[int]*MatchRanges)
*46c4c49dSIbrahim Kanouche		// Go over the set of source ranges growing lists of `possible` match ranges.
*46c4c49dSIbrahim Kanouche		tv := tgtNode.tokens
*46c4c49dSIbrahim Kanouche		for _, sv := range sr {
*46c4c49dSIbrahim Kanouche			r := &MatchRange{
*46c4c49dSIbrahim Kanouche				SrcStart:    sv.Start,
*46c4c49dSIbrahim Kanouche				SrcEnd:      sv.End,
*46c4c49dSIbrahim Kanouche				TargetStart: tv.Start,
*46c4c49dSIbrahim Kanouche				TargetEnd:   tv.End,
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			found := false
*46c4c49dSIbrahim Kanouche			// Grow or extend each abutting `possible` match range.
*46c4c49dSIbrahim Kanouche			for i, p := range possible {
*46c4c49dSIbrahim Kanouche				last := p[len(p)-1]
*46c4c49dSIbrahim Kanouche				if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd {
*46c4c49dSIbrahim Kanouche					found = true
*46c4c49dSIbrahim Kanouche					possible[i] = append(possible[i], r)
*46c4c49dSIbrahim Kanouche					extended[i] = &possible[i]
*46c4c49dSIbrahim Kanouche				}
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			if !found {
*46c4c49dSIbrahim Kanouche				// Did not abut any existing ranges, start a new `possible` match range.
*46c4c49dSIbrahim Kanouche				mrs := make(MatchRanges, 0, 2)
*46c4c49dSIbrahim Kanouche				mrs = append(mrs, r)
*46c4c49dSIbrahim Kanouche				possible = append(possible, mrs)
*46c4c49dSIbrahim Kanouche				extended[len(possible)-1] = &possible[len(possible)-1]
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		if len(extended) < len(possible) {
*46c4c49dSIbrahim Kanouche			// Ranges not extended--add to `matched` if not included in other range.
*46c4c49dSIbrahim Kanouche			for i := 0; i < len(possible); {
*46c4c49dSIbrahim Kanouche				_, updated := extended[i]
*46c4c49dSIbrahim Kanouche				if updated {
*46c4c49dSIbrahim Kanouche					i++ // Keep in `possible` and advance to next index.
*46c4c49dSIbrahim Kanouche					continue
*46c4c49dSIbrahim Kanouche				}
*46c4c49dSIbrahim Kanouche				p1 := possible[i]
*46c4c49dSIbrahim Kanouche				found := false // whether found as subrange of another `possible` match.
*46c4c49dSIbrahim Kanouche				for _, p2 := range extended {
*46c4c49dSIbrahim Kanouche					if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart {
*46c4c49dSIbrahim Kanouche						found = true
*46c4c49dSIbrahim Kanouche						break
*46c4c49dSIbrahim Kanouche					}
*46c4c49dSIbrahim Kanouche				}
*46c4c49dSIbrahim Kanouche				if !found {
*46c4c49dSIbrahim Kanouche					matched = append(matched, p1...)
*46c4c49dSIbrahim Kanouche				} // else included in other match.
*46c4c49dSIbrahim Kanouche				// Finished -- delete from `possible` and continue from same index.
*46c4c49dSIbrahim Kanouche				possible = append(possible[:i], possible[i+1:]...)
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		previous = tgtNode
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	// At end of file, terminate all `possible` match ranges.
*46c4c49dSIbrahim Kanouche	for i := 0; i < len(possible); i++ {
*46c4c49dSIbrahim Kanouche		p1 := possible[i]
*46c4c49dSIbrahim Kanouche		found := false // whether found as subrange of another `possible` match.
*46c4c49dSIbrahim Kanouche		for j := i + 1; j < len(possible); {
*46c4c49dSIbrahim Kanouche			p2 := possible[j]
*46c4c49dSIbrahim Kanouche			if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart {
*46c4c49dSIbrahim Kanouche				// Delete later sub-ranges included in this range.
*46c4c49dSIbrahim Kanouche				possible = append(possible[:j], possible[j+1:]...)
*46c4c49dSIbrahim Kanouche				continue
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			// Skip if subrange of a later range
*46c4c49dSIbrahim Kanouche			if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart {
*46c4c49dSIbrahim Kanouche				found = true
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			j++
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		if !found {
*46c4c49dSIbrahim Kanouche			matched = append(matched, p1...)
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return matched
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// untangleSourceRanges goes through the ranges and removes any whose source
*46c4c49dSIbrahim Kanouche// ranges are "out of order". A source range is "out of order" if the source
*46c4c49dSIbrahim Kanouche// range is out of sequence with the source ranges before and after it. This
*46c4c49dSIbrahim Kanouche// happens when more than one source range maps to the same target range.
*46c4c49dSIbrahim Kanouche// E.g.:
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche//	   SrcStart: 20, SrcEnd: 30, TargetStart: 127, TargetEnd: 137
*46c4c49dSIbrahim Kanouche//	1: SrcStart: 12, SrcEnd: 17, TargetStart: 138, TargetEnd: 143
*46c4c49dSIbrahim Kanouche//	2: SrcStart: 32, SrcEnd: 37, TargetStart: 138, TargetEnd: 143
*46c4c49dSIbrahim Kanouche//	   SrcStart: 38, SrcEnd: 40, TargetStart: 144, TargetEnd: 146
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche// Here (1) is out of order, because the source range [12, 17) is out of
*46c4c49dSIbrahim Kanouche// sequence with the surrounding source sequences, but [32, 37) is.
*46c4c49dSIbrahim Kanouchefunc untangleSourceRanges(matched MatchRanges) MatchRanges {
*46c4c49dSIbrahim Kanouche	mr := MatchRanges{matched[0]}
*46c4c49dSIbrahim KanoucheNEXT:
*46c4c49dSIbrahim Kanouche	for i := 1; i < len(matched); i++ {
*46c4c49dSIbrahim Kanouche		if mr[len(mr)-1].TargetStart == matched[i].TargetStart && mr[len(mr)-1].TargetEnd == matched[i].TargetEnd {
*46c4c49dSIbrahim Kanouche			// The matched range has already been added.
*46c4c49dSIbrahim Kanouche			continue
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche		if i+1 < len(matched) && equalTargetRange(matched[i], matched[i+1]) {
*46c4c49dSIbrahim Kanouche			// A sequence of ranges match the same target range.
*46c4c49dSIbrahim Kanouche			// Find the first one that has a source range greater
*46c4c49dSIbrahim Kanouche			// than the currently matched range. Omit all others.
*46c4c49dSIbrahim Kanouche			if matched[i].SrcStart > mr[len(mr)-1].SrcStart {
*46c4c49dSIbrahim Kanouche				mr = append(mr, matched[i])
*46c4c49dSIbrahim Kanouche				continue
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche			for j := i + 1; j < len(matched) && equalTargetRange(matched[i], matched[j]); j++ {
*46c4c49dSIbrahim Kanouche				// Check subsequent ranges to see if we can
*46c4c49dSIbrahim Kanouche				// find one that matches in the correct order.
*46c4c49dSIbrahim Kanouche				if matched[j].SrcStart > mr[len(mr)-1].SrcStart {
*46c4c49dSIbrahim Kanouche					mr = append(mr, matched[j])
*46c4c49dSIbrahim Kanouche					i = j
*46c4c49dSIbrahim Kanouche					continue NEXT
*46c4c49dSIbrahim Kanouche				}
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche		mr = append(mr, matched[i])
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return mr
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// equalTargetRange returns true if the two MatchRange's cover the same target range.
*46c4c49dSIbrahim Kanouchefunc equalTargetRange(this, that *MatchRange) bool {
*46c4c49dSIbrahim Kanouche	return this.TargetStart == that.TargetStart && this.TargetEnd == that.TargetEnd
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// splitRanges splits the matched ranges so that a single match range has a
*46c4c49dSIbrahim Kanouche// monotonically increasing source range (indicating a single, potential
*46c4c49dSIbrahim Kanouche// instance of the source in the target).
*46c4c49dSIbrahim Kanouchefunc splitRanges(matched MatchRanges) []MatchRanges {
*46c4c49dSIbrahim Kanouche	var matchedRanges []MatchRanges
*46c4c49dSIbrahim Kanouche	mr := MatchRanges{matched[0]}
*46c4c49dSIbrahim Kanouche	for i := 1; i < len(matched); i++ {
*46c4c49dSIbrahim Kanouche		if mr[len(mr)-1].SrcStart > matched[i].SrcStart {
*46c4c49dSIbrahim Kanouche			matchedRanges = append(matchedRanges, mr)
*46c4c49dSIbrahim Kanouche			mr = MatchRanges{matched[i]}
*46c4c49dSIbrahim Kanouche		} else {
*46c4c49dSIbrahim Kanouche			mr = append(mr, matched[i])
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	matchedRanges = append(matchedRanges, mr)
*46c4c49dSIbrahim Kanouche	return matchedRanges
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// mergeConsecutiveRanges goes through the matched ranges and merges
*46c4c49dSIbrahim Kanouche// consecutive ranges. Two ranges are consecutive if the end of the previous
*46c4c49dSIbrahim Kanouche// matched range and beginning of the next matched range overlap. "matched"
*46c4c49dSIbrahim Kanouche// should have 1 or more MatchRanges, each with one or more MatchRange objects.
*46c4c49dSIbrahim Kanouchefunc mergeConsecutiveRanges(matched []MatchRanges) []MatchRanges {
*46c4c49dSIbrahim Kanouche	mr := []MatchRanges{matched[0]}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	// Convenience functions.
*46c4c49dSIbrahim Kanouche	prevMatchedRange := func() MatchRanges {
*46c4c49dSIbrahim Kanouche		return mr[len(mr)-1]
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	prevMatchedRangeLastElem := func() *MatchRange {
*46c4c49dSIbrahim Kanouche		return prevMatchedRange()[len(prevMatchedRange())-1]
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	// This algorithm compares the start of each MatchRanges object to the
*46c4c49dSIbrahim Kanouche	// end of the previous MatchRanges object. If they overlap, then it
*46c4c49dSIbrahim Kanouche	// tries to combine them. Note that a 0 offset into a MatchRanges
*46c4c49dSIbrahim Kanouche	// object (e.g., matched[i][0]) is its first MatchRange, which
*46c4c49dSIbrahim Kanouche	// indicates the start of the whole matched range.
*46c4c49dSIbrahim KanoucheNEXT:
*46c4c49dSIbrahim Kanouche	for i := 1; i < len(matched); i++ {
*46c4c49dSIbrahim Kanouche		if prevMatchedRangeLastElem().TargetEnd > matched[i][0].TargetStart {
*46c4c49dSIbrahim Kanouche			// Consecutive matched ranges overlap. Merge them.
*46c4c49dSIbrahim Kanouche			if prevMatchedRangeLastElem().TargetStart < matched[i][0].TargetStart {
*46c4c49dSIbrahim Kanouche				// The last element of the previous matched
*46c4c49dSIbrahim Kanouche				// range overlaps with the first element of the
*46c4c49dSIbrahim Kanouche				// current matched range. Concatenate them.
*46c4c49dSIbrahim Kanouche				if prevMatchedRangeLastElem().TargetEnd < matched[i][0].TargetEnd {
*46c4c49dSIbrahim Kanouche					prevMatchedRangeLastElem().SrcEnd += matched[i][0].TargetEnd - prevMatchedRangeLastElem().TargetEnd
*46c4c49dSIbrahim Kanouche					prevMatchedRangeLastElem().TargetEnd = matched[i][0].TargetEnd
*46c4c49dSIbrahim Kanouche				}
*46c4c49dSIbrahim Kanouche				mr[len(mr)-1] = append(prevMatchedRange(), matched[i][1:]...)
*46c4c49dSIbrahim Kanouche				continue
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche			for j := 1; j < len(matched[i]); j++ {
*46c4c49dSIbrahim Kanouche				// Find the positions in the ranges where the
*46c4c49dSIbrahim Kanouche				// tail end of the previous matched range
*46c4c49dSIbrahim Kanouche				// overlaps with the start of the next matched
*46c4c49dSIbrahim Kanouche				// range.
*46c4c49dSIbrahim Kanouche				for k := len(prevMatchedRange()) - 1; k > 0; k-- {
*46c4c49dSIbrahim Kanouche					if prevMatchedRange()[k].SrcStart < matched[i][j].SrcStart &&
*46c4c49dSIbrahim Kanouche						prevMatchedRange()[k].TargetStart < matched[i][j].TargetStart {
*46c4c49dSIbrahim Kanouche						// Append the next range to the previous range.
*46c4c49dSIbrahim Kanouche						if prevMatchedRange()[k].TargetEnd < matched[i][j].TargetStart {
*46c4c49dSIbrahim Kanouche							// Coalesce the ranges.
*46c4c49dSIbrahim Kanouche							prevMatchedRange()[k].SrcEnd += matched[i][j-1].TargetEnd - prevMatchedRange()[k].TargetEnd
*46c4c49dSIbrahim Kanouche							prevMatchedRange()[k].TargetEnd = matched[i][j-1].TargetEnd
*46c4c49dSIbrahim Kanouche						}
*46c4c49dSIbrahim Kanouche						mr[len(mr)-1] = append(prevMatchedRange()[:k+1], matched[i][j:]...)
*46c4c49dSIbrahim Kanouche						continue NEXT
*46c4c49dSIbrahim Kanouche					}
*46c4c49dSIbrahim Kanouche				}
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		mr = append(mr, matched[i])
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return mr
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// coalesceMatchRanges coalesces overlapping match ranges into a single
*46c4c49dSIbrahim Kanouche// contiguous match range.
*46c4c49dSIbrahim Kanouchefunc coalesceMatchRanges(matchedRanges MatchRanges) MatchRanges {
*46c4c49dSIbrahim Kanouche	coalesced := MatchRanges{matchedRanges[0]}
*46c4c49dSIbrahim Kanouche	for i := 1; i < len(matchedRanges); i++ {
*46c4c49dSIbrahim Kanouche		c := coalesced[len(coalesced)-1]
*46c4c49dSIbrahim Kanouche		mr := matchedRanges[i]
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche		if mr.SrcStart <= c.SrcEnd && mr.SrcStart >= c.SrcStart {
*46c4c49dSIbrahim Kanouche			var se, ts, te int
*46c4c49dSIbrahim Kanouche			if mr.SrcEnd > c.SrcEnd {
*46c4c49dSIbrahim Kanouche				se = mr.SrcEnd
*46c4c49dSIbrahim Kanouche			} else {
*46c4c49dSIbrahim Kanouche				se = c.SrcEnd
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			if mr.TargetStart < c.TargetStart {
*46c4c49dSIbrahim Kanouche				ts = mr.TargetStart
*46c4c49dSIbrahim Kanouche			} else {
*46c4c49dSIbrahim Kanouche				ts = c.TargetStart
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			if mr.TargetEnd > c.TargetEnd {
*46c4c49dSIbrahim Kanouche				te = mr.TargetEnd
*46c4c49dSIbrahim Kanouche			} else {
*46c4c49dSIbrahim Kanouche				te = c.TargetEnd
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			coalesced[len(coalesced)-1] = &MatchRange{
*46c4c49dSIbrahim Kanouche				SrcStart:    c.SrcStart,
*46c4c49dSIbrahim Kanouche				SrcEnd:      se,
*46c4c49dSIbrahim Kanouche				TargetStart: ts,
*46c4c49dSIbrahim Kanouche				TargetEnd:   te,
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche		} else {
*46c4c49dSIbrahim Kanouche			coalesced = append(coalesced, mr)
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return coalesced
*46c4c49dSIbrahim Kanouche}