// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package searchset generates hashes for all substrings of a text. Potential
// matches between two SearchSet objects can then be determined quickly.
// Generating the hashes can be expensive, so it's best to perform it once. If
// the text is part of a known corpus, then the SearchSet can be serialized and
// kept in an archive.
//
// Matching occurs by "mapping" ranges from the source text into the target
// text but still retaining the source order:
//
//	SOURCE: |-----------------------------|
//
//	TARGET: |*****************************************|
//
// MAP SOURCE SECTIONS ONTO TARGET IN SOURCE ORDER:
//
//	S: |-[--]-----[---]------[----]------|
//	        /          |            \
//	   |---|     |---------|   |-------------|
//	T: |*****************************************|
//
// Note that a single source range may match many different ranges in the
// target. The matching algorithm untangles these so that all matched ranges
// are in order with respect to the source ranges. This is especially important
// since the source text may occur more than once in the target text. The
// algorithm finds each potential occurrence of S in T and returns all as
// potential matched ranges.
package searchset

import (
	"encoding/gob"
	"fmt"
	"io"
	"sort"

	"github.com/google/licenseclassifier/stringclassifier/searchset/tokenizer"
)

// DefaultGranularity is the minimum size (in words) of the hash chunks.
const DefaultGranularity = 3

// SearchSet is a set of substrings that have hashes associated with them,
// making it fast to search for potential matches.
type SearchSet struct {
	// Tokens is a tokenized list of the original input string.
	Tokens tokenizer.Tokens
	// Hashes is a map of checksums to a range of tokens.
	Hashes tokenizer.Hash
	// Checksums is a list of checksums ordered from longest range to
	// shortest.
	Checksums []uint32
	// ChecksumRanges are the token ranges for the above checksums.
	ChecksumRanges tokenizer.TokenRanges

	nodes []*node
}

// node consists of a range of tokens along with the checksum for those tokens.
type node struct {
	checksum uint32
	tokens   *tokenizer.TokenRange
}

func (n *node) String() string {
	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
}
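
// From a client package, the typical flow through this API is sketched below.
// This is an illustrative example only: knownText and unknownText are
// placeholder variables, and error handling is elided.
//
//	src := searchset.New(knownText, searchset.DefaultGranularity)
//	target := searchset.New(unknownText, searchset.DefaultGranularity)
//	for _, m := range searchset.FindPotentialMatches(src, target) {
//		start, end := m.TargetRange(target)
//		candidate := unknownText[start:end] // region of the unknown text to examine further
//		_ = candidate
//	}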

// New creates a new SearchSet object. It generates a hash for each substring of "s".
func New(s string, granularity int) *SearchSet {
	toks := tokenizer.Tokenize(s)

	// Start generating hash values for all substrings within the text,
	// using the smaller of the token count and the requested granularity
	// as the hash chunk size.
	h := make(tokenizer.Hash)
	checksums, tokenRanges := toks.GenerateHashes(h, func(a, b int) int {
		if a < b {
			return a
		}
		return b
	}(len(toks), granularity))

	sset := &SearchSet{
		Tokens:         toks,
		Hashes:         h,
		Checksums:      checksums,
		ChecksumRanges: tokenRanges,
	}
	sset.GenerateNodeList()
	return sset
}

// GenerateNodeList creates a node list out of the search set.
func (s *SearchSet) GenerateNodeList() {
	if len(s.Tokens) == 0 {
		return
	}

	for i := 0; i < len(s.Checksums); i++ {
		s.nodes = append(s.nodes, &node{
			checksum: s.Checksums[i],
			tokens:   s.ChecksumRanges[i],
		})
	}
}

// Serialize writes the SearchSet to w so that it can be recreated at a later
// time.
func (s *SearchSet) Serialize(w io.Writer) error {
	return gob.NewEncoder(w).Encode(s)
}

// Deserialize reads a serialized SearchSet from r, reconstructs it into s, and
// regenerates the node list.
func Deserialize(r io.Reader, s *SearchSet) error {
	if err := gob.NewDecoder(r).Decode(&s); err != nil {
		return err
	}
	s.GenerateNodeList()
	return nil
}

// MatchRange is the range within the source text that is a match to the range
// in the target text.
type MatchRange struct {
	// Offsets into the source tokens.
	SrcStart, SrcEnd int
	// Offsets into the target tokens.
	TargetStart, TargetEnd int
}

// in returns true if start and end are enclosed in the match range's target range.
func (m *MatchRange) in(start, end int) bool {
	return start >= m.TargetStart && end <= m.TargetEnd
}

func (m *MatchRange) String() string {
	return fmt.Sprintf("[%v, %v)->[%v, %v)", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd)
}

// MatchRanges is a list of MatchRange values. The ranges are monotonically
// increasing in value and indicate a single potential occurrence of the source
// text in the target text.
type MatchRanges []*MatchRange

func (m MatchRanges) Len() int      { return len(m) }
func (m MatchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m MatchRanges) Less(i, j int) bool {
	if m[i].TargetStart < m[j].TargetStart {
		return true
	}
	return m[i].TargetStart == m[j].TargetStart && m[i].SrcStart < m[j].SrcStart
}

// TargetRange returns the start and end offsets within the original target
// text that are covered by the matched ranges.
func (m MatchRanges) TargetRange(target *SearchSet) (start, end int) {
	start = target.Tokens[m[0].TargetStart].Offset
	end = target.Tokens[m[len(m)-1].TargetEnd-1].Offset + len(target.Tokens[m[len(m)-1].TargetEnd-1].Text)
	return start, end
}

// Size is the number of source tokens that were matched.
func (m MatchRanges) Size() int {
	sum := 0
	for _, mr := range m {
		sum += mr.SrcEnd - mr.SrcStart
	}
	return sum
}

// FindPotentialMatches returns the ranges in the target (unknown) text that
// are the best potential matches to the source (known) text.
func FindPotentialMatches(src, target *SearchSet) []MatchRanges {
	matchedRanges := getMatchedRanges(src, target)
	if len(matchedRanges) == 0 {
		return nil
	}

	// Clean up the matched ranges so that we get the longest contiguous ranges.
	for i := 0; i < len(matchedRanges); i++ {
		matchedRanges[i] = coalesceMatchRanges(matchedRanges[i])
	}
	return matchedRanges
}
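
// As the package comment notes, a SearchSet for a known corpus can be built
// once, serialized, and reloaded later instead of being recomputed. A minimal
// round trip might look like the sketch below (illustrative only: a
// bytes.Buffer stands in for whatever archive the corpus lives in, and error
// handling is elided).
//
//	var buf bytes.Buffer
//	_ = src.Serialize(&buf)
//
//	restored := &searchset.SearchSet{}
//	_ = searchset.Deserialize(&buf, restored)
//	// restored now holds the same Tokens, Hashes, Checksums, and
//	// ChecksumRanges as src, and its internal node list has been rebuilt.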

// getMatchedRanges finds the ranges in the target text that match the source
// text. There can be multiple occurrences of the source text within the
// target text. Each separate occurrence is an entry in the returned slice.
func getMatchedRanges(src, target *SearchSet) []MatchRanges {
	matched := targetMatchedRanges(src, target)
	if len(matched) == 0 {
		return nil
	}
	sort.Sort(matched)
	matched = untangleSourceRanges(matched)
	matchedRanges := splitRanges(matched)
	return mergeConsecutiveRanges(matchedRanges)
}

// extendsAny reports whether any of the token ranges in tr could extend one of
// the candidate match range lists in mr.
func extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool {
	if len(mr) == 0 {
		return false
	}
	for _, tv := range tr {
		for _, mv := range mr {
			if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd {
				return true
			}
		}
	}
	return false
}
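
// To illustrate the candidates constructed inside targetMatchedRanges below
// (the numbers are made up): if the source token ranges [5, 8) and [40, 43)
// both hash to the same checksum as the target token range [17, 20), the inner
// loop pairs each source range with that target range, yielding the candidate
// MatchRange values [5, 8)->[17, 20) and [40, 43)->[17, 20) (in the notation
// of MatchRange.String). Redundant candidates are pruned both here and in the
// later untangling and splitting stages.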

// targetMatchedRanges finds matching sequences in target and src, ordered by
// target position.
func targetMatchedRanges(src, target *SearchSet) MatchRanges {
	if src.nodes == nil {
		return nil
	}

	var matched MatchRanges
	var previous *node
	var possible []MatchRanges
	for _, tgtNode := range target.nodes {
		sr, ok := src.Hashes[tgtNode.checksum]
		if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) {
			for _, r := range possible {
				matched = append(matched, r...)
			}
			possible = possible[:0]
			previous = nil
		}
		if !ok {
			// There isn't a match in the source.
			continue
		}

		// Maps an index within `possible` to the slice of ranges extended by a new range.
		extended := make(map[int]*MatchRanges)
		// Go over the set of source ranges, growing the lists of `possible` match ranges.
		tv := tgtNode.tokens
		for _, sv := range sr {
			r := &MatchRange{
				SrcStart:    sv.Start,
				SrcEnd:      sv.End,
				TargetStart: tv.Start,
				TargetEnd:   tv.End,
			}
			found := false
			// Grow or extend each abutting `possible` match range.
			for i, p := range possible {
				last := p[len(p)-1]
				if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd {
					found = true
					possible[i] = append(possible[i], r)
					extended[i] = &possible[i]
				}
			}
			if !found {
				// Did not abut any existing ranges; start a new `possible` match range.
				mrs := make(MatchRanges, 0, 2)
				mrs = append(mrs, r)
				possible = append(possible, mrs)
				extended[len(possible)-1] = &possible[len(possible)-1]
			}
		}
		if len(extended) < len(possible) {
			// Ranges not extended: add them to `matched` unless they are included in another range.
			for i := 0; i < len(possible); {
				_, updated := extended[i]
				if updated {
					i++ // Keep in `possible` and advance to the next index.
					continue
				}
				p1 := possible[i]
				found := false // whether p1 is a subrange of another `possible` match
				for _, p2 := range extended {
					if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart {
						found = true
						break
					}
				}
				if !found {
					matched = append(matched, p1...)
				} // else included in another match.
				// Finished; delete from `possible` and continue from the same index.
				possible = append(possible[:i], possible[i+1:]...)
			}
		}
		previous = tgtNode
	}
	// At the end of the target, terminate all remaining `possible` match ranges.
	for i := 0; i < len(possible); i++ {
		p1 := possible[i]
		found := false // whether p1 is a subrange of another `possible` match
		for j := i + 1; j < len(possible); {
			p2 := possible[j]
			if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart {
				// Delete later subranges included in this range.
				possible = append(possible[:j], possible[j+1:]...)
				continue
			}
			// Skip p1 if it is a subrange of a later range.
			if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart {
				found = true
			}
			j++
		}
		if !found {
			matched = append(matched, p1...)
		}
	}
	return matched
}

// untangleSourceRanges goes through the ranges and removes any whose source
// ranges are "out of order". A source range is "out of order" if it is out of
// sequence with the source ranges before and after it. This happens when more
// than one source range maps to the same target range. E.g.:
//
//	   SrcStart: 20, SrcEnd: 30, TargetStart: 127, TargetEnd: 137
//	1: SrcStart: 12, SrcEnd: 17, TargetStart: 138, TargetEnd: 143
//	2: SrcStart: 32, SrcEnd: 37, TargetStart: 138, TargetEnd: 143
//	   SrcStart: 38, SrcEnd: 40, TargetStart: 144, TargetEnd: 146
//
// Here (1) is out of order, because the source range [12, 17) is out of
// sequence with the surrounding source ranges, while [32, 37) in (2) is in
// sequence and is kept.
func untangleSourceRanges(matched MatchRanges) MatchRanges {
	mr := MatchRanges{matched[0]}
NEXT:
	for i := 1; i < len(matched); i++ {
		if mr[len(mr)-1].TargetStart == matched[i].TargetStart && mr[len(mr)-1].TargetEnd == matched[i].TargetEnd {
			// The matched range has already been added.
			continue
		}

		if i+1 < len(matched) && equalTargetRange(matched[i], matched[i+1]) {
			// A sequence of ranges matches the same target range.
			// Find the first one that has a source range greater
			// than the currently matched range. Omit all others.
			if matched[i].SrcStart > mr[len(mr)-1].SrcStart {
				mr = append(mr, matched[i])
				continue
			}

			for j := i + 1; j < len(matched) && equalTargetRange(matched[i], matched[j]); j++ {
				// Check subsequent ranges to see if we can
				// find one that matches in the correct order.
				if matched[j].SrcStart > mr[len(mr)-1].SrcStart {
					mr = append(mr, matched[j])
					i = j
					continue NEXT
				}
			}
		}

		mr = append(mr, matched[i])
	}
	return mr
}

// equalTargetRange returns true if the two MatchRange values cover the same target range.
func equalTargetRange(this, that *MatchRange) bool {
	return this.TargetStart == that.TargetStart && this.TargetEnd == that.TargetEnd
}

// splitRanges splits the matched ranges so that each MatchRanges element has a
// monotonically increasing source range, indicating a single potential
// instance of the source in the target.
func splitRanges(matched MatchRanges) []MatchRanges {
	var matchedRanges []MatchRanges
	mr := MatchRanges{matched[0]}
	for i := 1; i < len(matched); i++ {
		if mr[len(mr)-1].SrcStart > matched[i].SrcStart {
			matchedRanges = append(matchedRanges, mr)
			mr = MatchRanges{matched[i]}
		} else {
			mr = append(mr, matched[i])
		}
	}
	matchedRanges = append(matchedRanges, mr)
	return matchedRanges
}
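
// An illustrative example of splitRanges above (the numbers are made up): if
// the already sorted and untangled matches have SrcStart values 10, 20, 35, 5,
// 18, 40 in target order, the drop from 35 back to 5 signals a second
// occurrence of the source, so the result is two MatchRanges: one holding the
// matches starting at 10, 20, and 35, and another holding the matches starting
// at 5, 18, and 40.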

// mergeConsecutiveRanges goes through the matched ranges and merges
// consecutive ranges. Two ranges are consecutive if the end of the previous
// matched range and beginning of the next matched range overlap. "matched"
// should have 1 or more MatchRanges, each with one or more MatchRange objects.
func mergeConsecutiveRanges(matched []MatchRanges) []MatchRanges {
	mr := []MatchRanges{matched[0]}

	// Convenience functions.
	prevMatchedRange := func() MatchRanges {
		return mr[len(mr)-1]
	}
	prevMatchedRangeLastElem := func() *MatchRange {
		return prevMatchedRange()[len(prevMatchedRange())-1]
	}

	// This algorithm compares the start of each MatchRanges object to the
	// end of the previous MatchRanges object. If they overlap, then it
	// tries to combine them. Note that a 0 offset into a MatchRanges
	// object (e.g., matched[i][0]) is its first MatchRange, which
	// indicates the start of the whole matched range.
NEXT:
	for i := 1; i < len(matched); i++ {
		if prevMatchedRangeLastElem().TargetEnd > matched[i][0].TargetStart {
			// Consecutive matched ranges overlap. Merge them.
			if prevMatchedRangeLastElem().TargetStart < matched[i][0].TargetStart {
				// The last element of the previous matched
				// range overlaps with the first element of the
				// current matched range. Concatenate them.
				if prevMatchedRangeLastElem().TargetEnd < matched[i][0].TargetEnd {
					prevMatchedRangeLastElem().SrcEnd += matched[i][0].TargetEnd - prevMatchedRangeLastElem().TargetEnd
					prevMatchedRangeLastElem().TargetEnd = matched[i][0].TargetEnd
				}
				mr[len(mr)-1] = append(prevMatchedRange(), matched[i][1:]...)
				continue
			}

			for j := 1; j < len(matched[i]); j++ {
				// Find the positions in the ranges where the
				// tail end of the previous matched range
				// overlaps with the start of the next matched
				// range.
				for k := len(prevMatchedRange()) - 1; k > 0; k-- {
					if prevMatchedRange()[k].SrcStart < matched[i][j].SrcStart &&
						prevMatchedRange()[k].TargetStart < matched[i][j].TargetStart {
						// Append the next range to the previous range.
						if prevMatchedRange()[k].TargetEnd < matched[i][j].TargetStart {
							// Coalesce the ranges.
							prevMatchedRange()[k].SrcEnd += matched[i][j-1].TargetEnd - prevMatchedRange()[k].TargetEnd
							prevMatchedRange()[k].TargetEnd = matched[i][j-1].TargetEnd
						}
						mr[len(mr)-1] = append(prevMatchedRange()[:k+1], matched[i][j:]...)
						continue NEXT
					}
				}
			}
		}
		mr = append(mr, matched[i])
	}
	return mr
}

// coalesceMatchRanges coalesces overlapping match ranges into a single
// contiguous match range.
func coalesceMatchRanges(matchedRanges MatchRanges) MatchRanges {
	coalesced := MatchRanges{matchedRanges[0]}
	for i := 1; i < len(matchedRanges); i++ {
		c := coalesced[len(coalesced)-1]
		mr := matchedRanges[i]

		if mr.SrcStart <= c.SrcEnd && mr.SrcStart >= c.SrcStart {
			var se, ts, te int
			if mr.SrcEnd > c.SrcEnd {
				se = mr.SrcEnd
			} else {
				se = c.SrcEnd
			}
			if mr.TargetStart < c.TargetStart {
				ts = mr.TargetStart
			} else {
				ts = c.TargetStart
			}
			if mr.TargetEnd > c.TargetEnd {
				te = mr.TargetEnd
			} else {
				te = c.TargetEnd
			}
			coalesced[len(coalesced)-1] = &MatchRange{
				SrcStart:    c.SrcStart,
				SrcEnd:      se,
				TargetStart: ts,
				TargetEnd:   te,
			}
		} else {
			coalesced = append(coalesced, mr)
		}
	}
	return coalesced
}
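
// An illustrative example of coalesceMatchRanges above (the numbers are made
// up): the overlapping match ranges [10, 20)->[100, 110) and
// [15, 25)->[105, 115) are folded into the single range [10, 25)->[100, 115),
// because the second range's source start (15) falls within the first range's
// source span.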