1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche 15*46c4c49dSIbrahim Kanouche// Package searchset generates hashes for all substrings of a text. Potential 16*46c4c49dSIbrahim Kanouche// matches between two SearchSet objects can then be determined quickly. 17*46c4c49dSIbrahim Kanouche// Generating the hashes can be expensive, so it's best to perform it once. If 18*46c4c49dSIbrahim Kanouche// the text is part of a known corpus, then the SearchSet can be serialized and 19*46c4c49dSIbrahim Kanouche// kept in an archive. 20*46c4c49dSIbrahim Kanouche// 21*46c4c49dSIbrahim Kanouche// Matching occurs by "mapping" ranges from the source text into the target 22*46c4c49dSIbrahim Kanouche// text but still retaining the source order: 23*46c4c49dSIbrahim Kanouche// 24*46c4c49dSIbrahim Kanouche// SOURCE: |-----------------------------| 25*46c4c49dSIbrahim Kanouche// 26*46c4c49dSIbrahim Kanouche// TARGET: |*****************************************| 27*46c4c49dSIbrahim Kanouche// 28*46c4c49dSIbrahim Kanouche// MAP SOURCE SECTIONS ONTO TARGET IN SOURCE ORDER: 29*46c4c49dSIbrahim Kanouche// 30*46c4c49dSIbrahim Kanouche// S: |-[--]-----[---]------[----]------| 31*46c4c49dSIbrahim Kanouche// / | \ 32*46c4c49dSIbrahim Kanouche// |---| |---------| |-------------| 33*46c4c49dSIbrahim Kanouche// T: |*****************************************| 34*46c4c49dSIbrahim Kanouche// 35*46c4c49dSIbrahim Kanouche// Note that a single source range may match many different ranges in the 36*46c4c49dSIbrahim Kanouche// target. The matching algorithm untangles these so that all matched ranges 37*46c4c49dSIbrahim Kanouche// are in order with respect to the source ranges. This is especially important 38*46c4c49dSIbrahim Kanouche// since the source text may occur more than once in the target text. The 39*46c4c49dSIbrahim Kanouche// algorithm finds each potential occurrence of S in T and returns all as 40*46c4c49dSIbrahim Kanouche// potential matched ranges. 41*46c4c49dSIbrahim Kanouchepackage searchset 42*46c4c49dSIbrahim Kanouche 43*46c4c49dSIbrahim Kanoucheimport ( 44*46c4c49dSIbrahim Kanouche "encoding/gob" 45*46c4c49dSIbrahim Kanouche "fmt" 46*46c4c49dSIbrahim Kanouche "io" 47*46c4c49dSIbrahim Kanouche "sort" 48*46c4c49dSIbrahim Kanouche 49*46c4c49dSIbrahim Kanouche "github.com/google/licenseclassifier/stringclassifier/searchset/tokenizer" 50*46c4c49dSIbrahim Kanouche) 51*46c4c49dSIbrahim Kanouche 52*46c4c49dSIbrahim Kanouche// DefaultGranularity is the minimum size (in words) of the hash chunks. 53*46c4c49dSIbrahim Kanoucheconst DefaultGranularity = 3 54*46c4c49dSIbrahim Kanouche 55*46c4c49dSIbrahim Kanouche// SearchSet is a set of substrings that have hashes associated with them, 56*46c4c49dSIbrahim Kanouche// making it fast to search for potential matches. 57*46c4c49dSIbrahim Kanouchetype SearchSet struct { 58*46c4c49dSIbrahim Kanouche // Tokens is a tokenized list of the original input string. 59*46c4c49dSIbrahim Kanouche Tokens tokenizer.Tokens 60*46c4c49dSIbrahim Kanouche // Hashes is a map of checksums to a range of tokens. 61*46c4c49dSIbrahim Kanouche Hashes tokenizer.Hash 62*46c4c49dSIbrahim Kanouche // Checksums is a list of checksums ordered from longest range to 63*46c4c49dSIbrahim Kanouche // shortest. 64*46c4c49dSIbrahim Kanouche Checksums []uint32 65*46c4c49dSIbrahim Kanouche // ChecksumRanges are the token ranges for the above checksums. 66*46c4c49dSIbrahim Kanouche ChecksumRanges tokenizer.TokenRanges 67*46c4c49dSIbrahim Kanouche 68*46c4c49dSIbrahim Kanouche nodes []*node 69*46c4c49dSIbrahim Kanouche} 70*46c4c49dSIbrahim Kanouche 71*46c4c49dSIbrahim Kanouche// node consists of a range of tokens along with the checksum for those tokens. 72*46c4c49dSIbrahim Kanouchetype node struct { 73*46c4c49dSIbrahim Kanouche checksum uint32 74*46c4c49dSIbrahim Kanouche tokens *tokenizer.TokenRange 75*46c4c49dSIbrahim Kanouche} 76*46c4c49dSIbrahim Kanouche 77*46c4c49dSIbrahim Kanouchefunc (n *node) String() string { 78*46c4c49dSIbrahim Kanouche return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End) 79*46c4c49dSIbrahim Kanouche} 80*46c4c49dSIbrahim Kanouche 81*46c4c49dSIbrahim Kanouche// New creates a new SearchSet object. It generates a hash for each substring of "s". 82*46c4c49dSIbrahim Kanouchefunc New(s string, granularity int) *SearchSet { 83*46c4c49dSIbrahim Kanouche toks := tokenizer.Tokenize(s) 84*46c4c49dSIbrahim Kanouche 85*46c4c49dSIbrahim Kanouche // Start generating hash values for all substrings within the text. 86*46c4c49dSIbrahim Kanouche h := make(tokenizer.Hash) 87*46c4c49dSIbrahim Kanouche checksums, tokenRanges := toks.GenerateHashes(h, func(a, b int) int { 88*46c4c49dSIbrahim Kanouche if a < b { 89*46c4c49dSIbrahim Kanouche return a 90*46c4c49dSIbrahim Kanouche } 91*46c4c49dSIbrahim Kanouche return b 92*46c4c49dSIbrahim Kanouche }(len(toks), granularity)) 93*46c4c49dSIbrahim Kanouche sset := &SearchSet{ 94*46c4c49dSIbrahim Kanouche Tokens: toks, 95*46c4c49dSIbrahim Kanouche Hashes: h, 96*46c4c49dSIbrahim Kanouche Checksums: checksums, 97*46c4c49dSIbrahim Kanouche ChecksumRanges: tokenRanges, 98*46c4c49dSIbrahim Kanouche } 99*46c4c49dSIbrahim Kanouche sset.GenerateNodeList() 100*46c4c49dSIbrahim Kanouche return sset 101*46c4c49dSIbrahim Kanouche} 102*46c4c49dSIbrahim Kanouche 103*46c4c49dSIbrahim Kanouche// GenerateNodeList creates a node list out of the search set. 104*46c4c49dSIbrahim Kanouchefunc (s *SearchSet) GenerateNodeList() { 105*46c4c49dSIbrahim Kanouche if len(s.Tokens) == 0 { 106*46c4c49dSIbrahim Kanouche return 107*46c4c49dSIbrahim Kanouche } 108*46c4c49dSIbrahim Kanouche 109*46c4c49dSIbrahim Kanouche for i := 0; i < len(s.Checksums); i++ { 110*46c4c49dSIbrahim Kanouche s.nodes = append(s.nodes, &node{ 111*46c4c49dSIbrahim Kanouche checksum: s.Checksums[i], 112*46c4c49dSIbrahim Kanouche tokens: s.ChecksumRanges[i], 113*46c4c49dSIbrahim Kanouche }) 114*46c4c49dSIbrahim Kanouche } 115*46c4c49dSIbrahim Kanouche} 116*46c4c49dSIbrahim Kanouche 117*46c4c49dSIbrahim Kanouche// Serialize emits the SearchSet out so that it can be recreated at a later 118*46c4c49dSIbrahim Kanouche// time. 119*46c4c49dSIbrahim Kanouchefunc (s *SearchSet) Serialize(w io.Writer) error { 120*46c4c49dSIbrahim Kanouche return gob.NewEncoder(w).Encode(s) 121*46c4c49dSIbrahim Kanouche} 122*46c4c49dSIbrahim Kanouche 123*46c4c49dSIbrahim Kanouche// Deserialize reads a file with a serialized SearchSet in it and reconstructs it. 124*46c4c49dSIbrahim Kanouchefunc Deserialize(r io.Reader, s *SearchSet) error { 125*46c4c49dSIbrahim Kanouche if err := gob.NewDecoder(r).Decode(&s); err != nil { 126*46c4c49dSIbrahim Kanouche return err 127*46c4c49dSIbrahim Kanouche } 128*46c4c49dSIbrahim Kanouche s.GenerateNodeList() 129*46c4c49dSIbrahim Kanouche return nil 130*46c4c49dSIbrahim Kanouche} 131*46c4c49dSIbrahim Kanouche 132*46c4c49dSIbrahim Kanouche// MatchRange is the range within the source text that is a match to the range 133*46c4c49dSIbrahim Kanouche// in the target text. 134*46c4c49dSIbrahim Kanouchetype MatchRange struct { 135*46c4c49dSIbrahim Kanouche // Offsets into the source tokens. 136*46c4c49dSIbrahim Kanouche SrcStart, SrcEnd int 137*46c4c49dSIbrahim Kanouche // Offsets into the target tokens. 138*46c4c49dSIbrahim Kanouche TargetStart, TargetEnd int 139*46c4c49dSIbrahim Kanouche} 140*46c4c49dSIbrahim Kanouche 141*46c4c49dSIbrahim Kanouche// in returns true if the start and end are enclosed in the match range. 142*46c4c49dSIbrahim Kanouchefunc (m *MatchRange) in(start, end int) bool { 143*46c4c49dSIbrahim Kanouche return start >= m.TargetStart && end <= m.TargetEnd 144*46c4c49dSIbrahim Kanouche} 145*46c4c49dSIbrahim Kanouche 146*46c4c49dSIbrahim Kanouchefunc (m *MatchRange) String() string { 147*46c4c49dSIbrahim Kanouche return fmt.Sprintf("[%v, %v)->[%v, %v)", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd) 148*46c4c49dSIbrahim Kanouche} 149*46c4c49dSIbrahim Kanouche 150*46c4c49dSIbrahim Kanouche// MatchRanges is a list of "MatchRange"s. The ranges are monotonically 151*46c4c49dSIbrahim Kanouche// increasing in value and indicate a single potential occurrence of the source 152*46c4c49dSIbrahim Kanouche// text in the target text. 153*46c4c49dSIbrahim Kanouchetype MatchRanges []*MatchRange 154*46c4c49dSIbrahim Kanouche 155*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Len() int { return len(m) } 156*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 157*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Less(i, j int) bool { 158*46c4c49dSIbrahim Kanouche if m[i].TargetStart < m[j].TargetStart { 159*46c4c49dSIbrahim Kanouche return true 160*46c4c49dSIbrahim Kanouche } 161*46c4c49dSIbrahim Kanouche return m[i].TargetStart == m[j].TargetStart && m[i].SrcStart < m[j].SrcStart 162*46c4c49dSIbrahim Kanouche} 163*46c4c49dSIbrahim Kanouche 164*46c4c49dSIbrahim Kanouche// TargetRange is the start and stop token offsets into the target text. 165*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) TargetRange(target *SearchSet) (start, end int) { 166*46c4c49dSIbrahim Kanouche start = target.Tokens[m[0].TargetStart].Offset 167*46c4c49dSIbrahim Kanouche end = target.Tokens[m[len(m)-1].TargetEnd-1].Offset + len(target.Tokens[m[len(m)-1].TargetEnd-1].Text) 168*46c4c49dSIbrahim Kanouche return start, end 169*46c4c49dSIbrahim Kanouche} 170*46c4c49dSIbrahim Kanouche 171*46c4c49dSIbrahim Kanouche// Size is the number of source tokens that were matched. 172*46c4c49dSIbrahim Kanouchefunc (m MatchRanges) Size() int { 173*46c4c49dSIbrahim Kanouche sum := 0 174*46c4c49dSIbrahim Kanouche for _, mr := range m { 175*46c4c49dSIbrahim Kanouche sum += mr.SrcEnd - mr.SrcStart 176*46c4c49dSIbrahim Kanouche } 177*46c4c49dSIbrahim Kanouche return sum 178*46c4c49dSIbrahim Kanouche} 179*46c4c49dSIbrahim Kanouche 180*46c4c49dSIbrahim Kanouche// FindPotentialMatches returns the ranges in the target (unknown) text that 181*46c4c49dSIbrahim Kanouche// are best potential matches to the source (known) text. 182*46c4c49dSIbrahim Kanouchefunc FindPotentialMatches(src, target *SearchSet) []MatchRanges { 183*46c4c49dSIbrahim Kanouche matchedRanges := getMatchedRanges(src, target) 184*46c4c49dSIbrahim Kanouche if len(matchedRanges) == 0 { 185*46c4c49dSIbrahim Kanouche return nil 186*46c4c49dSIbrahim Kanouche } 187*46c4c49dSIbrahim Kanouche 188*46c4c49dSIbrahim Kanouche // Cleanup the matching ranges so that we get the longest contiguous ranges. 189*46c4c49dSIbrahim Kanouche for i := 0; i < len(matchedRanges); i++ { 190*46c4c49dSIbrahim Kanouche matchedRanges[i] = coalesceMatchRanges(matchedRanges[i]) 191*46c4c49dSIbrahim Kanouche } 192*46c4c49dSIbrahim Kanouche return matchedRanges 193*46c4c49dSIbrahim Kanouche} 194*46c4c49dSIbrahim Kanouche 195*46c4c49dSIbrahim Kanouche// getMatchedRanges finds the ranges in the target text that match the source 196*46c4c49dSIbrahim Kanouche// text. There can be multiple occurrences of the source text within the target 197*46c4c49dSIbrahim Kanouche// text. Each separate occurrence is an entry in the returned slice. 198*46c4c49dSIbrahim Kanouchefunc getMatchedRanges(src, target *SearchSet) []MatchRanges { 199*46c4c49dSIbrahim Kanouche matched := targetMatchedRanges(src, target) 200*46c4c49dSIbrahim Kanouche if len(matched) == 0 { 201*46c4c49dSIbrahim Kanouche return nil 202*46c4c49dSIbrahim Kanouche } 203*46c4c49dSIbrahim Kanouche sort.Sort(matched) 204*46c4c49dSIbrahim Kanouche matched = untangleSourceRanges(matched) 205*46c4c49dSIbrahim Kanouche matchedRanges := splitRanges(matched) 206*46c4c49dSIbrahim Kanouche return mergeConsecutiveRanges(matchedRanges) 207*46c4c49dSIbrahim Kanouche} 208*46c4c49dSIbrahim Kanouche 209*46c4c49dSIbrahim Kanouchefunc extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool { 210*46c4c49dSIbrahim Kanouche if len(mr) == 0 { 211*46c4c49dSIbrahim Kanouche return false 212*46c4c49dSIbrahim Kanouche } 213*46c4c49dSIbrahim Kanouche for _, tv := range tr { 214*46c4c49dSIbrahim Kanouche for _, mv := range mr { 215*46c4c49dSIbrahim Kanouche if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd { 216*46c4c49dSIbrahim Kanouche return true 217*46c4c49dSIbrahim Kanouche } 218*46c4c49dSIbrahim Kanouche } 219*46c4c49dSIbrahim Kanouche } 220*46c4c49dSIbrahim Kanouche return false 221*46c4c49dSIbrahim Kanouche} 222*46c4c49dSIbrahim Kanouche 223*46c4c49dSIbrahim Kanouche// targetMatchedRanges finds matching sequences in target and src ordered by target position 224*46c4c49dSIbrahim Kanouchefunc targetMatchedRanges(src, target *SearchSet) MatchRanges { 225*46c4c49dSIbrahim Kanouche if src.nodes == nil { 226*46c4c49dSIbrahim Kanouche return nil 227*46c4c49dSIbrahim Kanouche } 228*46c4c49dSIbrahim Kanouche 229*46c4c49dSIbrahim Kanouche var matched MatchRanges 230*46c4c49dSIbrahim Kanouche var previous *node 231*46c4c49dSIbrahim Kanouche var possible []MatchRanges 232*46c4c49dSIbrahim Kanouche for _, tgtNode := range target.nodes { 233*46c4c49dSIbrahim Kanouche sr, ok := src.Hashes[tgtNode.checksum] 234*46c4c49dSIbrahim Kanouche if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) { 235*46c4c49dSIbrahim Kanouche for _, r := range possible { 236*46c4c49dSIbrahim Kanouche matched = append(matched, r...) 237*46c4c49dSIbrahim Kanouche } 238*46c4c49dSIbrahim Kanouche possible = possible[:0] 239*46c4c49dSIbrahim Kanouche previous = nil 240*46c4c49dSIbrahim Kanouche } 241*46c4c49dSIbrahim Kanouche if !ok { 242*46c4c49dSIbrahim Kanouche // There isn't a match in the source. 243*46c4c49dSIbrahim Kanouche continue 244*46c4c49dSIbrahim Kanouche } 245*46c4c49dSIbrahim Kanouche 246*46c4c49dSIbrahim Kanouche // Maps index within `possible` to the slice of ranges extended by a new range 247*46c4c49dSIbrahim Kanouche extended := make(map[int]*MatchRanges) 248*46c4c49dSIbrahim Kanouche // Go over the set of source ranges growing lists of `possible` match ranges. 249*46c4c49dSIbrahim Kanouche tv := tgtNode.tokens 250*46c4c49dSIbrahim Kanouche for _, sv := range sr { 251*46c4c49dSIbrahim Kanouche r := &MatchRange{ 252*46c4c49dSIbrahim Kanouche SrcStart: sv.Start, 253*46c4c49dSIbrahim Kanouche SrcEnd: sv.End, 254*46c4c49dSIbrahim Kanouche TargetStart: tv.Start, 255*46c4c49dSIbrahim Kanouche TargetEnd: tv.End, 256*46c4c49dSIbrahim Kanouche } 257*46c4c49dSIbrahim Kanouche found := false 258*46c4c49dSIbrahim Kanouche // Grow or extend each abutting `possible` match range. 259*46c4c49dSIbrahim Kanouche for i, p := range possible { 260*46c4c49dSIbrahim Kanouche last := p[len(p)-1] 261*46c4c49dSIbrahim Kanouche if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd { 262*46c4c49dSIbrahim Kanouche found = true 263*46c4c49dSIbrahim Kanouche possible[i] = append(possible[i], r) 264*46c4c49dSIbrahim Kanouche extended[i] = &possible[i] 265*46c4c49dSIbrahim Kanouche } 266*46c4c49dSIbrahim Kanouche } 267*46c4c49dSIbrahim Kanouche if !found { 268*46c4c49dSIbrahim Kanouche // Did not abut any existing ranges, start a new `possible` match range. 269*46c4c49dSIbrahim Kanouche mrs := make(MatchRanges, 0, 2) 270*46c4c49dSIbrahim Kanouche mrs = append(mrs, r) 271*46c4c49dSIbrahim Kanouche possible = append(possible, mrs) 272*46c4c49dSIbrahim Kanouche extended[len(possible)-1] = &possible[len(possible)-1] 273*46c4c49dSIbrahim Kanouche } 274*46c4c49dSIbrahim Kanouche } 275*46c4c49dSIbrahim Kanouche if len(extended) < len(possible) { 276*46c4c49dSIbrahim Kanouche // Ranges not extended--add to `matched` if not included in other range. 277*46c4c49dSIbrahim Kanouche for i := 0; i < len(possible); { 278*46c4c49dSIbrahim Kanouche _, updated := extended[i] 279*46c4c49dSIbrahim Kanouche if updated { 280*46c4c49dSIbrahim Kanouche i++ // Keep in `possible` and advance to next index. 281*46c4c49dSIbrahim Kanouche continue 282*46c4c49dSIbrahim Kanouche } 283*46c4c49dSIbrahim Kanouche p1 := possible[i] 284*46c4c49dSIbrahim Kanouche found := false // whether found as subrange of another `possible` match. 285*46c4c49dSIbrahim Kanouche for _, p2 := range extended { 286*46c4c49dSIbrahim Kanouche if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart { 287*46c4c49dSIbrahim Kanouche found = true 288*46c4c49dSIbrahim Kanouche break 289*46c4c49dSIbrahim Kanouche } 290*46c4c49dSIbrahim Kanouche } 291*46c4c49dSIbrahim Kanouche if !found { 292*46c4c49dSIbrahim Kanouche matched = append(matched, p1...) 293*46c4c49dSIbrahim Kanouche } // else included in other match. 294*46c4c49dSIbrahim Kanouche // Finished -- delete from `possible` and continue from same index. 295*46c4c49dSIbrahim Kanouche possible = append(possible[:i], possible[i+1:]...) 296*46c4c49dSIbrahim Kanouche } 297*46c4c49dSIbrahim Kanouche } 298*46c4c49dSIbrahim Kanouche previous = tgtNode 299*46c4c49dSIbrahim Kanouche } 300*46c4c49dSIbrahim Kanouche // At end of file, terminate all `possible` match ranges. 301*46c4c49dSIbrahim Kanouche for i := 0; i < len(possible); i++ { 302*46c4c49dSIbrahim Kanouche p1 := possible[i] 303*46c4c49dSIbrahim Kanouche found := false // whether found as subrange of another `possible` match. 304*46c4c49dSIbrahim Kanouche for j := i + 1; j < len(possible); { 305*46c4c49dSIbrahim Kanouche p2 := possible[j] 306*46c4c49dSIbrahim Kanouche if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart { 307*46c4c49dSIbrahim Kanouche // Delete later sub-ranges included in this range. 308*46c4c49dSIbrahim Kanouche possible = append(possible[:j], possible[j+1:]...) 309*46c4c49dSIbrahim Kanouche continue 310*46c4c49dSIbrahim Kanouche } 311*46c4c49dSIbrahim Kanouche // Skip if subrange of a later range 312*46c4c49dSIbrahim Kanouche if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart { 313*46c4c49dSIbrahim Kanouche found = true 314*46c4c49dSIbrahim Kanouche } 315*46c4c49dSIbrahim Kanouche j++ 316*46c4c49dSIbrahim Kanouche } 317*46c4c49dSIbrahim Kanouche if !found { 318*46c4c49dSIbrahim Kanouche matched = append(matched, p1...) 319*46c4c49dSIbrahim Kanouche } 320*46c4c49dSIbrahim Kanouche } 321*46c4c49dSIbrahim Kanouche return matched 322*46c4c49dSIbrahim Kanouche} 323*46c4c49dSIbrahim Kanouche 324*46c4c49dSIbrahim Kanouche// untangleSourceRanges goes through the ranges and removes any whose source 325*46c4c49dSIbrahim Kanouche// ranges are "out of order". A source range is "out of order" if the source 326*46c4c49dSIbrahim Kanouche// range is out of sequence with the source ranges before and after it. This 327*46c4c49dSIbrahim Kanouche// happens when more than one source range maps to the same target range. 328*46c4c49dSIbrahim Kanouche// E.g.: 329*46c4c49dSIbrahim Kanouche// 330*46c4c49dSIbrahim Kanouche// SrcStart: 20, SrcEnd: 30, TargetStart: 127, TargetEnd: 137 331*46c4c49dSIbrahim Kanouche// 1: SrcStart: 12, SrcEnd: 17, TargetStart: 138, TargetEnd: 143 332*46c4c49dSIbrahim Kanouche// 2: SrcStart: 32, SrcEnd: 37, TargetStart: 138, TargetEnd: 143 333*46c4c49dSIbrahim Kanouche// SrcStart: 38, SrcEnd: 40, TargetStart: 144, TargetEnd: 146 334*46c4c49dSIbrahim Kanouche// 335*46c4c49dSIbrahim Kanouche// Here (1) is out of order, because the source range [12, 17) is out of 336*46c4c49dSIbrahim Kanouche// sequence with the surrounding source sequences, but [32, 37) is. 337*46c4c49dSIbrahim Kanouchefunc untangleSourceRanges(matched MatchRanges) MatchRanges { 338*46c4c49dSIbrahim Kanouche mr := MatchRanges{matched[0]} 339*46c4c49dSIbrahim KanoucheNEXT: 340*46c4c49dSIbrahim Kanouche for i := 1; i < len(matched); i++ { 341*46c4c49dSIbrahim Kanouche if mr[len(mr)-1].TargetStart == matched[i].TargetStart && mr[len(mr)-1].TargetEnd == matched[i].TargetEnd { 342*46c4c49dSIbrahim Kanouche // The matched range has already been added. 343*46c4c49dSIbrahim Kanouche continue 344*46c4c49dSIbrahim Kanouche } 345*46c4c49dSIbrahim Kanouche 346*46c4c49dSIbrahim Kanouche if i+1 < len(matched) && equalTargetRange(matched[i], matched[i+1]) { 347*46c4c49dSIbrahim Kanouche // A sequence of ranges match the same target range. 348*46c4c49dSIbrahim Kanouche // Find the first one that has a source range greater 349*46c4c49dSIbrahim Kanouche // than the currently matched range. Omit all others. 350*46c4c49dSIbrahim Kanouche if matched[i].SrcStart > mr[len(mr)-1].SrcStart { 351*46c4c49dSIbrahim Kanouche mr = append(mr, matched[i]) 352*46c4c49dSIbrahim Kanouche continue 353*46c4c49dSIbrahim Kanouche } 354*46c4c49dSIbrahim Kanouche 355*46c4c49dSIbrahim Kanouche for j := i + 1; j < len(matched) && equalTargetRange(matched[i], matched[j]); j++ { 356*46c4c49dSIbrahim Kanouche // Check subsequent ranges to see if we can 357*46c4c49dSIbrahim Kanouche // find one that matches in the correct order. 358*46c4c49dSIbrahim Kanouche if matched[j].SrcStart > mr[len(mr)-1].SrcStart { 359*46c4c49dSIbrahim Kanouche mr = append(mr, matched[j]) 360*46c4c49dSIbrahim Kanouche i = j 361*46c4c49dSIbrahim Kanouche continue NEXT 362*46c4c49dSIbrahim Kanouche } 363*46c4c49dSIbrahim Kanouche } 364*46c4c49dSIbrahim Kanouche } 365*46c4c49dSIbrahim Kanouche 366*46c4c49dSIbrahim Kanouche mr = append(mr, matched[i]) 367*46c4c49dSIbrahim Kanouche } 368*46c4c49dSIbrahim Kanouche return mr 369*46c4c49dSIbrahim Kanouche} 370*46c4c49dSIbrahim Kanouche 371*46c4c49dSIbrahim Kanouche// equalTargetRange returns true if the two MatchRange's cover the same target range. 372*46c4c49dSIbrahim Kanouchefunc equalTargetRange(this, that *MatchRange) bool { 373*46c4c49dSIbrahim Kanouche return this.TargetStart == that.TargetStart && this.TargetEnd == that.TargetEnd 374*46c4c49dSIbrahim Kanouche} 375*46c4c49dSIbrahim Kanouche 376*46c4c49dSIbrahim Kanouche// splitRanges splits the matched ranges so that a single match range has a 377*46c4c49dSIbrahim Kanouche// monotonically increasing source range (indicating a single, potential 378*46c4c49dSIbrahim Kanouche// instance of the source in the target). 379*46c4c49dSIbrahim Kanouchefunc splitRanges(matched MatchRanges) []MatchRanges { 380*46c4c49dSIbrahim Kanouche var matchedRanges []MatchRanges 381*46c4c49dSIbrahim Kanouche mr := MatchRanges{matched[0]} 382*46c4c49dSIbrahim Kanouche for i := 1; i < len(matched); i++ { 383*46c4c49dSIbrahim Kanouche if mr[len(mr)-1].SrcStart > matched[i].SrcStart { 384*46c4c49dSIbrahim Kanouche matchedRanges = append(matchedRanges, mr) 385*46c4c49dSIbrahim Kanouche mr = MatchRanges{matched[i]} 386*46c4c49dSIbrahim Kanouche } else { 387*46c4c49dSIbrahim Kanouche mr = append(mr, matched[i]) 388*46c4c49dSIbrahim Kanouche } 389*46c4c49dSIbrahim Kanouche } 390*46c4c49dSIbrahim Kanouche matchedRanges = append(matchedRanges, mr) 391*46c4c49dSIbrahim Kanouche return matchedRanges 392*46c4c49dSIbrahim Kanouche} 393*46c4c49dSIbrahim Kanouche 394*46c4c49dSIbrahim Kanouche// mergeConsecutiveRanges goes through the matched ranges and merges 395*46c4c49dSIbrahim Kanouche// consecutive ranges. Two ranges are consecutive if the end of the previous 396*46c4c49dSIbrahim Kanouche// matched range and beginning of the next matched range overlap. "matched" 397*46c4c49dSIbrahim Kanouche// should have 1 or more MatchRanges, each with one or more MatchRange objects. 398*46c4c49dSIbrahim Kanouchefunc mergeConsecutiveRanges(matched []MatchRanges) []MatchRanges { 399*46c4c49dSIbrahim Kanouche mr := []MatchRanges{matched[0]} 400*46c4c49dSIbrahim Kanouche 401*46c4c49dSIbrahim Kanouche // Convenience functions. 402*46c4c49dSIbrahim Kanouche prevMatchedRange := func() MatchRanges { 403*46c4c49dSIbrahim Kanouche return mr[len(mr)-1] 404*46c4c49dSIbrahim Kanouche } 405*46c4c49dSIbrahim Kanouche prevMatchedRangeLastElem := func() *MatchRange { 406*46c4c49dSIbrahim Kanouche return prevMatchedRange()[len(prevMatchedRange())-1] 407*46c4c49dSIbrahim Kanouche } 408*46c4c49dSIbrahim Kanouche 409*46c4c49dSIbrahim Kanouche // This algorithm compares the start of each MatchRanges object to the 410*46c4c49dSIbrahim Kanouche // end of the previous MatchRanges object. If they overlap, then it 411*46c4c49dSIbrahim Kanouche // tries to combine them. Note that a 0 offset into a MatchRanges 412*46c4c49dSIbrahim Kanouche // object (e.g., matched[i][0]) is its first MatchRange, which 413*46c4c49dSIbrahim Kanouche // indicates the start of the whole matched range. 414*46c4c49dSIbrahim KanoucheNEXT: 415*46c4c49dSIbrahim Kanouche for i := 1; i < len(matched); i++ { 416*46c4c49dSIbrahim Kanouche if prevMatchedRangeLastElem().TargetEnd > matched[i][0].TargetStart { 417*46c4c49dSIbrahim Kanouche // Consecutive matched ranges overlap. Merge them. 418*46c4c49dSIbrahim Kanouche if prevMatchedRangeLastElem().TargetStart < matched[i][0].TargetStart { 419*46c4c49dSIbrahim Kanouche // The last element of the previous matched 420*46c4c49dSIbrahim Kanouche // range overlaps with the first element of the 421*46c4c49dSIbrahim Kanouche // current matched range. Concatenate them. 422*46c4c49dSIbrahim Kanouche if prevMatchedRangeLastElem().TargetEnd < matched[i][0].TargetEnd { 423*46c4c49dSIbrahim Kanouche prevMatchedRangeLastElem().SrcEnd += matched[i][0].TargetEnd - prevMatchedRangeLastElem().TargetEnd 424*46c4c49dSIbrahim Kanouche prevMatchedRangeLastElem().TargetEnd = matched[i][0].TargetEnd 425*46c4c49dSIbrahim Kanouche } 426*46c4c49dSIbrahim Kanouche mr[len(mr)-1] = append(prevMatchedRange(), matched[i][1:]...) 427*46c4c49dSIbrahim Kanouche continue 428*46c4c49dSIbrahim Kanouche } 429*46c4c49dSIbrahim Kanouche 430*46c4c49dSIbrahim Kanouche for j := 1; j < len(matched[i]); j++ { 431*46c4c49dSIbrahim Kanouche // Find the positions in the ranges where the 432*46c4c49dSIbrahim Kanouche // tail end of the previous matched range 433*46c4c49dSIbrahim Kanouche // overlaps with the start of the next matched 434*46c4c49dSIbrahim Kanouche // range. 435*46c4c49dSIbrahim Kanouche for k := len(prevMatchedRange()) - 1; k > 0; k-- { 436*46c4c49dSIbrahim Kanouche if prevMatchedRange()[k].SrcStart < matched[i][j].SrcStart && 437*46c4c49dSIbrahim Kanouche prevMatchedRange()[k].TargetStart < matched[i][j].TargetStart { 438*46c4c49dSIbrahim Kanouche // Append the next range to the previous range. 439*46c4c49dSIbrahim Kanouche if prevMatchedRange()[k].TargetEnd < matched[i][j].TargetStart { 440*46c4c49dSIbrahim Kanouche // Coalesce the ranges. 441*46c4c49dSIbrahim Kanouche prevMatchedRange()[k].SrcEnd += matched[i][j-1].TargetEnd - prevMatchedRange()[k].TargetEnd 442*46c4c49dSIbrahim Kanouche prevMatchedRange()[k].TargetEnd = matched[i][j-1].TargetEnd 443*46c4c49dSIbrahim Kanouche } 444*46c4c49dSIbrahim Kanouche mr[len(mr)-1] = append(prevMatchedRange()[:k+1], matched[i][j:]...) 445*46c4c49dSIbrahim Kanouche continue NEXT 446*46c4c49dSIbrahim Kanouche } 447*46c4c49dSIbrahim Kanouche } 448*46c4c49dSIbrahim Kanouche } 449*46c4c49dSIbrahim Kanouche } 450*46c4c49dSIbrahim Kanouche mr = append(mr, matched[i]) 451*46c4c49dSIbrahim Kanouche } 452*46c4c49dSIbrahim Kanouche return mr 453*46c4c49dSIbrahim Kanouche} 454*46c4c49dSIbrahim Kanouche 455*46c4c49dSIbrahim Kanouche// coalesceMatchRanges coalesces overlapping match ranges into a single 456*46c4c49dSIbrahim Kanouche// contiguous match range. 457*46c4c49dSIbrahim Kanouchefunc coalesceMatchRanges(matchedRanges MatchRanges) MatchRanges { 458*46c4c49dSIbrahim Kanouche coalesced := MatchRanges{matchedRanges[0]} 459*46c4c49dSIbrahim Kanouche for i := 1; i < len(matchedRanges); i++ { 460*46c4c49dSIbrahim Kanouche c := coalesced[len(coalesced)-1] 461*46c4c49dSIbrahim Kanouche mr := matchedRanges[i] 462*46c4c49dSIbrahim Kanouche 463*46c4c49dSIbrahim Kanouche if mr.SrcStart <= c.SrcEnd && mr.SrcStart >= c.SrcStart { 464*46c4c49dSIbrahim Kanouche var se, ts, te int 465*46c4c49dSIbrahim Kanouche if mr.SrcEnd > c.SrcEnd { 466*46c4c49dSIbrahim Kanouche se = mr.SrcEnd 467*46c4c49dSIbrahim Kanouche } else { 468*46c4c49dSIbrahim Kanouche se = c.SrcEnd 469*46c4c49dSIbrahim Kanouche } 470*46c4c49dSIbrahim Kanouche if mr.TargetStart < c.TargetStart { 471*46c4c49dSIbrahim Kanouche ts = mr.TargetStart 472*46c4c49dSIbrahim Kanouche } else { 473*46c4c49dSIbrahim Kanouche ts = c.TargetStart 474*46c4c49dSIbrahim Kanouche } 475*46c4c49dSIbrahim Kanouche if mr.TargetEnd > c.TargetEnd { 476*46c4c49dSIbrahim Kanouche te = mr.TargetEnd 477*46c4c49dSIbrahim Kanouche } else { 478*46c4c49dSIbrahim Kanouche te = c.TargetEnd 479*46c4c49dSIbrahim Kanouche } 480*46c4c49dSIbrahim Kanouche coalesced[len(coalesced)-1] = &MatchRange{ 481*46c4c49dSIbrahim Kanouche SrcStart: c.SrcStart, 482*46c4c49dSIbrahim Kanouche SrcEnd: se, 483*46c4c49dSIbrahim Kanouche TargetStart: ts, 484*46c4c49dSIbrahim Kanouche TargetEnd: te, 485*46c4c49dSIbrahim Kanouche } 486*46c4c49dSIbrahim Kanouche } else { 487*46c4c49dSIbrahim Kanouche coalesced = append(coalesced, mr) 488*46c4c49dSIbrahim Kanouche } 489*46c4c49dSIbrahim Kanouche } 490*46c4c49dSIbrahim Kanouche return coalesced 491*46c4c49dSIbrahim Kanouche} 492