1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc. 2*46c4c49dSIbrahim Kanouche// 3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License"); 4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License. 5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at 6*46c4c49dSIbrahim Kanouche// 7*46c4c49dSIbrahim Kanouche// http://www.apache.org/licenses/LICENSE-2.0 8*46c4c49dSIbrahim Kanouche// 9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software 10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS, 11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and 13*46c4c49dSIbrahim Kanouche// limitations under the License. 14*46c4c49dSIbrahim Kanouche 15*46c4c49dSIbrahim Kanouche// Package stringclassifier finds the nearest match between a string and a set of known values. It 16*46c4c49dSIbrahim Kanouche// uses the Levenshtein Distance (LD) algorithm to determine this. A match with a large LD is less 17*46c4c49dSIbrahim Kanouche// likely to be correct than one with a small LD. A confidence percentage is returned, which 18*46c4c49dSIbrahim Kanouche// indicates how confident the algorithm is that the match is correct. The higher the percentage, 19*46c4c49dSIbrahim Kanouche// the greater the confidence that the match is correct. 20*46c4c49dSIbrahim Kanouche// 21*46c4c49dSIbrahim Kanouche// Example Usage: 22*46c4c49dSIbrahim Kanouche// 23*46c4c49dSIbrahim Kanouche// type Text struct { 24*46c4c49dSIbrahim Kanouche// Name string 25*46c4c49dSIbrahim Kanouche// Text string 26*46c4c49dSIbrahim Kanouche// } 27*46c4c49dSIbrahim Kanouche// 28*46c4c49dSIbrahim Kanouche// func NewClassifier(knownTexts []Text) (*stringclassifier.Classifier, error) { 29*46c4c49dSIbrahim Kanouche// sc := stringclassifier.New(stringclassifier.FlattenWhitespace) 30*46c4c49dSIbrahim Kanouche// for _, known := range knownTexts { 31*46c4c49dSIbrahim Kanouche// if err := sc.AddValue(known.Name, known.Text); err != nil { 32*46c4c49dSIbrahim Kanouche// return nil, err 33*46c4c49dSIbrahim Kanouche// } 34*46c4c49dSIbrahim Kanouche// } 35*46c4c49dSIbrahim Kanouche// return sc, nil 36*46c4c49dSIbrahim Kanouche// } 37*46c4c49dSIbrahim Kanouche// 38*46c4c49dSIbrahim Kanouche// func IdentifyTexts(sc *stringclassifier.Classifier, unknownTexts []*Text) { 39*46c4c49dSIbrahim Kanouche// for _, unknown := range unknownTexts { 40*46c4c49dSIbrahim Kanouche// m := sc.NearestMatch(unknown.Text) 41*46c4c49dSIbrahim Kanouche// log.Printf("The nearest match to %q is %q (confidence: %v)", 42*46c4c49dSIbrahim Kanouche// unknown.Name, m.Name, m.Confidence) 43*46c4c49dSIbrahim Kanouche// } 44*46c4c49dSIbrahim Kanouche// } 45*46c4c49dSIbrahim Kanouchepackage stringclassifier 46*46c4c49dSIbrahim Kanouche 47*46c4c49dSIbrahim Kanoucheimport ( 48*46c4c49dSIbrahim Kanouche "fmt" 49*46c4c49dSIbrahim Kanouche "log" 50*46c4c49dSIbrahim Kanouche "math" 51*46c4c49dSIbrahim Kanouche "regexp" 52*46c4c49dSIbrahim Kanouche "sort" 53*46c4c49dSIbrahim Kanouche "sync" 54*46c4c49dSIbrahim Kanouche 55*46c4c49dSIbrahim Kanouche "github.com/google/licenseclassifier/stringclassifier/internal/pq" 56*46c4c49dSIbrahim Kanouche "github.com/google/licenseclassifier/stringclassifier/searchset" 57*46c4c49dSIbrahim Kanouche "github.com/sergi/go-diff/diffmatchpatch" 58*46c4c49dSIbrahim Kanouche) 59*46c4c49dSIbrahim Kanouche 60*46c4c49dSIbrahim Kanouche// The diff/match/patch algorithm. 61*46c4c49dSIbrahim Kanouchevar dmp = diffmatchpatch.New() 62*46c4c49dSIbrahim Kanouche 63*46c4c49dSIbrahim Kanoucheconst ( 64*46c4c49dSIbrahim Kanouche // DefaultConfidenceThreshold is the minimum ratio threshold between 65*46c4c49dSIbrahim Kanouche // the matching range and the full source range that we're willing to 66*46c4c49dSIbrahim Kanouche // accept in order to say that the matching range will produce a 67*46c4c49dSIbrahim Kanouche // sufficiently good edit distance. I.e., if the matching range is 68*46c4c49dSIbrahim Kanouche // below this threshold we won't run the Levenshtein Distance algorithm 69*46c4c49dSIbrahim Kanouche // on it. 70*46c4c49dSIbrahim Kanouche DefaultConfidenceThreshold float64 = 0.80 71*46c4c49dSIbrahim Kanouche 72*46c4c49dSIbrahim Kanouche defaultMinDiffRatio float64 = 0.75 73*46c4c49dSIbrahim Kanouche) 74*46c4c49dSIbrahim Kanouche 75*46c4c49dSIbrahim Kanouche// A Classifier matches a string to a set of known values. 76*46c4c49dSIbrahim Kanouchetype Classifier struct { 77*46c4c49dSIbrahim Kanouche muValues sync.RWMutex 78*46c4c49dSIbrahim Kanouche values map[string]*knownValue 79*46c4c49dSIbrahim Kanouche normalizers []NormalizeFunc 80*46c4c49dSIbrahim Kanouche threshold float64 81*46c4c49dSIbrahim Kanouche 82*46c4c49dSIbrahim Kanouche // MinDiffRatio defines the minimum ratio of the length difference 83*46c4c49dSIbrahim Kanouche // allowed to consider a known value a possible match. This is used as 84*46c4c49dSIbrahim Kanouche // a performance optimization to eliminate values that are unlikely to 85*46c4c49dSIbrahim Kanouche // be a match. 86*46c4c49dSIbrahim Kanouche // 87*46c4c49dSIbrahim Kanouche // For example, a value of 0.75 means that the shorter string must be 88*46c4c49dSIbrahim Kanouche // at least 75% the length of the longer string to consider it a 89*46c4c49dSIbrahim Kanouche // possible match. 90*46c4c49dSIbrahim Kanouche // 91*46c4c49dSIbrahim Kanouche // Setting this to 1.0 will require that strings are identical length. 92*46c4c49dSIbrahim Kanouche // Setting this to 0 will consider all known values as possible 93*46c4c49dSIbrahim Kanouche // matches. 94*46c4c49dSIbrahim Kanouche MinDiffRatio float64 95*46c4c49dSIbrahim Kanouche} 96*46c4c49dSIbrahim Kanouche 97*46c4c49dSIbrahim Kanouche// NormalizeFunc is a function that is used to normalize a string prior to comparison. 98*46c4c49dSIbrahim Kanouchetype NormalizeFunc func(string) string 99*46c4c49dSIbrahim Kanouche 100*46c4c49dSIbrahim Kanouche// New creates a new Classifier with the provided NormalizeFuncs. Each 101*46c4c49dSIbrahim Kanouche// NormalizeFunc is applied in order to a string before comparison. 102*46c4c49dSIbrahim Kanouchefunc New(threshold float64, funcs ...NormalizeFunc) *Classifier { 103*46c4c49dSIbrahim Kanouche return &Classifier{ 104*46c4c49dSIbrahim Kanouche values: make(map[string]*knownValue), 105*46c4c49dSIbrahim Kanouche normalizers: append([]NormalizeFunc(nil), funcs...), 106*46c4c49dSIbrahim Kanouche threshold: threshold, 107*46c4c49dSIbrahim Kanouche MinDiffRatio: defaultMinDiffRatio, 108*46c4c49dSIbrahim Kanouche } 109*46c4c49dSIbrahim Kanouche} 110*46c4c49dSIbrahim Kanouche 111*46c4c49dSIbrahim Kanouche// knownValue identifies a value in the corpus to match against. 112*46c4c49dSIbrahim Kanouchetype knownValue struct { 113*46c4c49dSIbrahim Kanouche key string 114*46c4c49dSIbrahim Kanouche normalizedValue string 115*46c4c49dSIbrahim Kanouche reValue *regexp.Regexp 116*46c4c49dSIbrahim Kanouche set *searchset.SearchSet 117*46c4c49dSIbrahim Kanouche} 118*46c4c49dSIbrahim Kanouche 119*46c4c49dSIbrahim Kanouche// AddValue adds a known value to be matched against. If a value already exists 120*46c4c49dSIbrahim Kanouche// for key, an error is returned. 121*46c4c49dSIbrahim Kanouchefunc (c *Classifier) AddValue(key, value string) error { 122*46c4c49dSIbrahim Kanouche c.muValues.Lock() 123*46c4c49dSIbrahim Kanouche defer c.muValues.Unlock() 124*46c4c49dSIbrahim Kanouche if _, ok := c.values[key]; ok { 125*46c4c49dSIbrahim Kanouche return fmt.Errorf("value already registered with key %q", key) 126*46c4c49dSIbrahim Kanouche } 127*46c4c49dSIbrahim Kanouche norm := c.normalize(value) 128*46c4c49dSIbrahim Kanouche c.values[key] = &knownValue{ 129*46c4c49dSIbrahim Kanouche key: key, 130*46c4c49dSIbrahim Kanouche normalizedValue: norm, 131*46c4c49dSIbrahim Kanouche reValue: regexp.MustCompile(norm), 132*46c4c49dSIbrahim Kanouche } 133*46c4c49dSIbrahim Kanouche return nil 134*46c4c49dSIbrahim Kanouche} 135*46c4c49dSIbrahim Kanouche 136*46c4c49dSIbrahim Kanouche// AddPrecomputedValue adds a known value to be matched against. The value has 137*46c4c49dSIbrahim Kanouche// already been normalized and the SearchSet object deserialized, so no 138*46c4c49dSIbrahim Kanouche// processing is necessary. 139*46c4c49dSIbrahim Kanouchefunc (c *Classifier) AddPrecomputedValue(key, value string, set *searchset.SearchSet) error { 140*46c4c49dSIbrahim Kanouche c.muValues.Lock() 141*46c4c49dSIbrahim Kanouche defer c.muValues.Unlock() 142*46c4c49dSIbrahim Kanouche if _, ok := c.values[key]; ok { 143*46c4c49dSIbrahim Kanouche return fmt.Errorf("value already registered with key %q", key) 144*46c4c49dSIbrahim Kanouche } 145*46c4c49dSIbrahim Kanouche set.GenerateNodeList() 146*46c4c49dSIbrahim Kanouche c.values[key] = &knownValue{ 147*46c4c49dSIbrahim Kanouche key: key, 148*46c4c49dSIbrahim Kanouche normalizedValue: value, 149*46c4c49dSIbrahim Kanouche reValue: regexp.MustCompile(value), 150*46c4c49dSIbrahim Kanouche set: set, 151*46c4c49dSIbrahim Kanouche } 152*46c4c49dSIbrahim Kanouche return nil 153*46c4c49dSIbrahim Kanouche} 154*46c4c49dSIbrahim Kanouche 155*46c4c49dSIbrahim Kanouche// normalize a string by applying each of the registered NormalizeFuncs. 156*46c4c49dSIbrahim Kanouchefunc (c *Classifier) normalize(s string) string { 157*46c4c49dSIbrahim Kanouche for _, fn := range c.normalizers { 158*46c4c49dSIbrahim Kanouche s = fn(s) 159*46c4c49dSIbrahim Kanouche } 160*46c4c49dSIbrahim Kanouche return s 161*46c4c49dSIbrahim Kanouche} 162*46c4c49dSIbrahim Kanouche 163*46c4c49dSIbrahim Kanouche// Match identifies the result of matching a string against a knownValue. 164*46c4c49dSIbrahim Kanouchetype Match struct { 165*46c4c49dSIbrahim Kanouche Name string // Name of knownValue that was matched 166*46c4c49dSIbrahim Kanouche Confidence float64 // Confidence percentage 167*46c4c49dSIbrahim Kanouche Offset int // The offset into the unknown string the match was made 168*46c4c49dSIbrahim Kanouche Extent int // The length from the offset into the unknown string 169*46c4c49dSIbrahim Kanouche} 170*46c4c49dSIbrahim Kanouche 171*46c4c49dSIbrahim Kanouche// Matches is a list of Match-es. This is here mainly so that the list can be 172*46c4c49dSIbrahim Kanouche// sorted. 173*46c4c49dSIbrahim Kanouchetype Matches []*Match 174*46c4c49dSIbrahim Kanouche 175*46c4c49dSIbrahim Kanouchefunc (m Matches) Len() int { return len(m) } 176*46c4c49dSIbrahim Kanouchefunc (m Matches) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 177*46c4c49dSIbrahim Kanouchefunc (m Matches) Less(i, j int) bool { 178*46c4c49dSIbrahim Kanouche if math.Abs(m[j].Confidence-m[i].Confidence) < math.SmallestNonzeroFloat64 { 179*46c4c49dSIbrahim Kanouche if m[i].Name == m[j].Name { 180*46c4c49dSIbrahim Kanouche if m[i].Offset > m[j].Offset { 181*46c4c49dSIbrahim Kanouche return false 182*46c4c49dSIbrahim Kanouche } 183*46c4c49dSIbrahim Kanouche if m[i].Offset == m[j].Offset { 184*46c4c49dSIbrahim Kanouche return m[i].Extent > m[j].Extent 185*46c4c49dSIbrahim Kanouche } 186*46c4c49dSIbrahim Kanouche return true 187*46c4c49dSIbrahim Kanouche } 188*46c4c49dSIbrahim Kanouche return m[i].Name < m[j].Name 189*46c4c49dSIbrahim Kanouche } 190*46c4c49dSIbrahim Kanouche return m[i].Confidence > m[j].Confidence 191*46c4c49dSIbrahim Kanouche} 192*46c4c49dSIbrahim Kanouche 193*46c4c49dSIbrahim Kanouche// Names returns an unsorted slice of the names of the matched licenses. 194*46c4c49dSIbrahim Kanouchefunc (m Matches) Names() []string { 195*46c4c49dSIbrahim Kanouche var names []string 196*46c4c49dSIbrahim Kanouche for _, n := range m { 197*46c4c49dSIbrahim Kanouche names = append(names, n.Name) 198*46c4c49dSIbrahim Kanouche } 199*46c4c49dSIbrahim Kanouche return names 200*46c4c49dSIbrahim Kanouche} 201*46c4c49dSIbrahim Kanouche 202*46c4c49dSIbrahim Kanouche// uniquify goes through the matches and removes any that are contained within 203*46c4c49dSIbrahim Kanouche// one with a higher confidence. This assumes that Matches is sorted. 204*46c4c49dSIbrahim Kanouchefunc (m Matches) uniquify() Matches { 205*46c4c49dSIbrahim Kanouche type matchedRange struct { 206*46c4c49dSIbrahim Kanouche offset, extent int 207*46c4c49dSIbrahim Kanouche } 208*46c4c49dSIbrahim Kanouche 209*46c4c49dSIbrahim Kanouche var matched []matchedRange 210*46c4c49dSIbrahim Kanouche var matches Matches 211*46c4c49dSIbrahim KanoucheOUTER: 212*46c4c49dSIbrahim Kanouche for _, match := range m { 213*46c4c49dSIbrahim Kanouche for _, mr := range matched { 214*46c4c49dSIbrahim Kanouche if match.Offset >= mr.offset && match.Offset <= mr.offset+mr.extent { 215*46c4c49dSIbrahim Kanouche continue OUTER 216*46c4c49dSIbrahim Kanouche } 217*46c4c49dSIbrahim Kanouche } 218*46c4c49dSIbrahim Kanouche matched = append(matched, matchedRange{match.Offset, match.Extent}) 219*46c4c49dSIbrahim Kanouche matches = append(matches, match) 220*46c4c49dSIbrahim Kanouche } 221*46c4c49dSIbrahim Kanouche 222*46c4c49dSIbrahim Kanouche return matches 223*46c4c49dSIbrahim Kanouche} 224*46c4c49dSIbrahim Kanouche 225*46c4c49dSIbrahim Kanouche// NearestMatch returns the name of the known value that most closely matches 226*46c4c49dSIbrahim Kanouche// the unknown string and a confidence percentage is returned indicating how 227*46c4c49dSIbrahim Kanouche// confident the classifier is in the result. A percentage of "1.0" indicates 228*46c4c49dSIbrahim Kanouche// an exact match, while a percentage of "0.0" indicates a complete mismatch. 229*46c4c49dSIbrahim Kanouche// 230*46c4c49dSIbrahim Kanouche// If the string is equidistant from multiple known values, it is undefined 231*46c4c49dSIbrahim Kanouche// which will be returned. 232*46c4c49dSIbrahim Kanouchefunc (c *Classifier) NearestMatch(s string) *Match { 233*46c4c49dSIbrahim Kanouche pq := c.nearestMatch(s) 234*46c4c49dSIbrahim Kanouche if pq.Len() == 0 { 235*46c4c49dSIbrahim Kanouche return &Match{} 236*46c4c49dSIbrahim Kanouche } 237*46c4c49dSIbrahim Kanouche return pq.Pop().(*Match) 238*46c4c49dSIbrahim Kanouche} 239*46c4c49dSIbrahim Kanouche 240*46c4c49dSIbrahim Kanouche// MultipleMatch tries to determine which known strings are found within an 241*46c4c49dSIbrahim Kanouche// unknown string. This differs from "NearestMatch" in that it looks only at 242*46c4c49dSIbrahim Kanouche// those areas within the unknown string that are likely to match. A list of 243*46c4c49dSIbrahim Kanouche// potential matches are returned. It's up to the caller to determine which 244*46c4c49dSIbrahim Kanouche// ones are acceptable. 245*46c4c49dSIbrahim Kanouchefunc (c *Classifier) MultipleMatch(s string) (matches Matches) { 246*46c4c49dSIbrahim Kanouche pq := c.multipleMatch(s) 247*46c4c49dSIbrahim Kanouche if pq == nil { 248*46c4c49dSIbrahim Kanouche return matches 249*46c4c49dSIbrahim Kanouche } 250*46c4c49dSIbrahim Kanouche 251*46c4c49dSIbrahim Kanouche // A map to remove duplicate entries. 252*46c4c49dSIbrahim Kanouche m := make(map[Match]bool) 253*46c4c49dSIbrahim Kanouche 254*46c4c49dSIbrahim Kanouche for pq.Len() != 0 { 255*46c4c49dSIbrahim Kanouche v := pq.Pop().(*Match) 256*46c4c49dSIbrahim Kanouche if _, ok := m[*v]; !ok { 257*46c4c49dSIbrahim Kanouche m[*v] = true 258*46c4c49dSIbrahim Kanouche matches = append(matches, v) 259*46c4c49dSIbrahim Kanouche } 260*46c4c49dSIbrahim Kanouche } 261*46c4c49dSIbrahim Kanouche 262*46c4c49dSIbrahim Kanouche sort.Sort(matches) 263*46c4c49dSIbrahim Kanouche return matches.uniquify() 264*46c4c49dSIbrahim Kanouche} 265*46c4c49dSIbrahim Kanouche 266*46c4c49dSIbrahim Kanouche// possibleMatch identifies a known value and it's diffRatio to a given string. 267*46c4c49dSIbrahim Kanouchetype possibleMatch struct { 268*46c4c49dSIbrahim Kanouche value *knownValue 269*46c4c49dSIbrahim Kanouche diffRatio float64 270*46c4c49dSIbrahim Kanouche} 271*46c4c49dSIbrahim Kanouche 272*46c4c49dSIbrahim Kanouche// likelyMatches is a slice of possibleMatches that can be sorted by their 273*46c4c49dSIbrahim Kanouche// diffRatio to a given string, such that the most likely matches (based on 274*46c4c49dSIbrahim Kanouche// length) are at the beginning. 275*46c4c49dSIbrahim Kanouchetype likelyMatches []possibleMatch 276*46c4c49dSIbrahim Kanouche 277*46c4c49dSIbrahim Kanouchefunc (m likelyMatches) Len() int { return len(m) } 278*46c4c49dSIbrahim Kanouchefunc (m likelyMatches) Less(i, j int) bool { return m[i].diffRatio > m[j].diffRatio } 279*46c4c49dSIbrahim Kanouchefunc (m likelyMatches) Swap(i, j int) { m[i], m[j] = m[j], m[i] } 280*46c4c49dSIbrahim Kanouche 281*46c4c49dSIbrahim Kanouche// nearestMatch returns a Queue of values that the unknown string may be. The 282*46c4c49dSIbrahim Kanouche// values are compared via their Levenshtein Distance and ranked with the 283*46c4c49dSIbrahim Kanouche// nearest match at the beginning. 284*46c4c49dSIbrahim Kanouchefunc (c *Classifier) nearestMatch(unknown string) *pq.Queue { 285*46c4c49dSIbrahim Kanouche var mu sync.Mutex // Protect the priority queue. 286*46c4c49dSIbrahim Kanouche pq := pq.NewQueue(func(x, y interface{}) bool { 287*46c4c49dSIbrahim Kanouche return x.(*Match).Confidence > y.(*Match).Confidence 288*46c4c49dSIbrahim Kanouche }, nil) 289*46c4c49dSIbrahim Kanouche 290*46c4c49dSIbrahim Kanouche unknown = c.normalize(unknown) 291*46c4c49dSIbrahim Kanouche if len(unknown) == 0 { 292*46c4c49dSIbrahim Kanouche return pq 293*46c4c49dSIbrahim Kanouche } 294*46c4c49dSIbrahim Kanouche 295*46c4c49dSIbrahim Kanouche c.muValues.RLock() 296*46c4c49dSIbrahim Kanouche var likely likelyMatches 297*46c4c49dSIbrahim Kanouche for _, v := range c.values { 298*46c4c49dSIbrahim Kanouche dr := diffRatio(unknown, v.normalizedValue) 299*46c4c49dSIbrahim Kanouche if dr < c.MinDiffRatio { 300*46c4c49dSIbrahim Kanouche continue 301*46c4c49dSIbrahim Kanouche } 302*46c4c49dSIbrahim Kanouche if unknown == v.normalizedValue { 303*46c4c49dSIbrahim Kanouche // We found an exact match. 304*46c4c49dSIbrahim Kanouche pq.Push(&Match{Name: v.key, Confidence: 1.0, Offset: 0, Extent: len(unknown)}) 305*46c4c49dSIbrahim Kanouche c.muValues.RUnlock() 306*46c4c49dSIbrahim Kanouche return pq 307*46c4c49dSIbrahim Kanouche } 308*46c4c49dSIbrahim Kanouche likely = append(likely, possibleMatch{value: v, diffRatio: dr}) 309*46c4c49dSIbrahim Kanouche } 310*46c4c49dSIbrahim Kanouche c.muValues.RUnlock() 311*46c4c49dSIbrahim Kanouche sort.Sort(likely) 312*46c4c49dSIbrahim Kanouche 313*46c4c49dSIbrahim Kanouche var wg sync.WaitGroup 314*46c4c49dSIbrahim Kanouche classifyString := func(name, unknown, known string) { 315*46c4c49dSIbrahim Kanouche defer wg.Done() 316*46c4c49dSIbrahim Kanouche 317*46c4c49dSIbrahim Kanouche diffs := dmp.DiffMain(unknown, known, true) 318*46c4c49dSIbrahim Kanouche distance := dmp.DiffLevenshtein(diffs) 319*46c4c49dSIbrahim Kanouche confidence := confidencePercentage(len(unknown), len(known), distance) 320*46c4c49dSIbrahim Kanouche if confidence > 0.0 { 321*46c4c49dSIbrahim Kanouche mu.Lock() 322*46c4c49dSIbrahim Kanouche pq.Push(&Match{Name: name, Confidence: confidence, Offset: 0, Extent: len(unknown)}) 323*46c4c49dSIbrahim Kanouche mu.Unlock() 324*46c4c49dSIbrahim Kanouche } 325*46c4c49dSIbrahim Kanouche } 326*46c4c49dSIbrahim Kanouche 327*46c4c49dSIbrahim Kanouche wg.Add(len(likely)) 328*46c4c49dSIbrahim Kanouche for _, known := range likely { 329*46c4c49dSIbrahim Kanouche go classifyString(known.value.key, unknown, known.value.normalizedValue) 330*46c4c49dSIbrahim Kanouche } 331*46c4c49dSIbrahim Kanouche wg.Wait() 332*46c4c49dSIbrahim Kanouche return pq 333*46c4c49dSIbrahim Kanouche} 334*46c4c49dSIbrahim Kanouche 335*46c4c49dSIbrahim Kanouche// matcher finds all potential matches of "known" in "unknown". The results are 336*46c4c49dSIbrahim Kanouche// placed in "queue". 337*46c4c49dSIbrahim Kanouchetype matcher struct { 338*46c4c49dSIbrahim Kanouche unknown *searchset.SearchSet 339*46c4c49dSIbrahim Kanouche normUnknown string 340*46c4c49dSIbrahim Kanouche threshold float64 341*46c4c49dSIbrahim Kanouche 342*46c4c49dSIbrahim Kanouche mu sync.Mutex 343*46c4c49dSIbrahim Kanouche queue *pq.Queue 344*46c4c49dSIbrahim Kanouche} 345*46c4c49dSIbrahim Kanouche 346*46c4c49dSIbrahim Kanouche// newMatcher creates a "matcher" object. 347*46c4c49dSIbrahim Kanouchefunc newMatcher(unknown string, threshold float64) *matcher { 348*46c4c49dSIbrahim Kanouche return &matcher{ 349*46c4c49dSIbrahim Kanouche unknown: searchset.New(unknown, searchset.DefaultGranularity), 350*46c4c49dSIbrahim Kanouche normUnknown: unknown, 351*46c4c49dSIbrahim Kanouche threshold: threshold, 352*46c4c49dSIbrahim Kanouche queue: pq.NewQueue(func(x, y interface{}) bool { 353*46c4c49dSIbrahim Kanouche return x.(*Match).Confidence > y.(*Match).Confidence 354*46c4c49dSIbrahim Kanouche }, nil), 355*46c4c49dSIbrahim Kanouche } 356*46c4c49dSIbrahim Kanouche} 357*46c4c49dSIbrahim Kanouche 358*46c4c49dSIbrahim Kanouche// findMatches takes a known text and finds all potential instances of it in 359*46c4c49dSIbrahim Kanouche// the unknown text. The resulting matches can then filtered to determine which 360*46c4c49dSIbrahim Kanouche// are the best matches. 361*46c4c49dSIbrahim Kanouchefunc (m *matcher) findMatches(known *knownValue) { 362*46c4c49dSIbrahim Kanouche var mrs []searchset.MatchRanges 363*46c4c49dSIbrahim Kanouche if all := known.reValue.FindAllStringIndex(m.normUnknown, -1); all != nil { 364*46c4c49dSIbrahim Kanouche // We found exact matches. Just use those! 365*46c4c49dSIbrahim Kanouche for _, a := range all { 366*46c4c49dSIbrahim Kanouche var start, end int 367*46c4c49dSIbrahim Kanouche for i, tok := range m.unknown.Tokens { 368*46c4c49dSIbrahim Kanouche if tok.Offset == a[0] { 369*46c4c49dSIbrahim Kanouche start = i 370*46c4c49dSIbrahim Kanouche } else if tok.Offset >= a[len(a)-1]-len(tok.Text) { 371*46c4c49dSIbrahim Kanouche end = i 372*46c4c49dSIbrahim Kanouche break 373*46c4c49dSIbrahim Kanouche } 374*46c4c49dSIbrahim Kanouche } 375*46c4c49dSIbrahim Kanouche 376*46c4c49dSIbrahim Kanouche mrs = append(mrs, searchset.MatchRanges{{ 377*46c4c49dSIbrahim Kanouche SrcStart: 0, 378*46c4c49dSIbrahim Kanouche SrcEnd: len(known.set.Tokens), 379*46c4c49dSIbrahim Kanouche TargetStart: start, 380*46c4c49dSIbrahim Kanouche TargetEnd: end + 1, 381*46c4c49dSIbrahim Kanouche }}) 382*46c4c49dSIbrahim Kanouche } 383*46c4c49dSIbrahim Kanouche } else { 384*46c4c49dSIbrahim Kanouche // No exact match. Perform a more thorough match. 385*46c4c49dSIbrahim Kanouche mrs = searchset.FindPotentialMatches(known.set, m.unknown) 386*46c4c49dSIbrahim Kanouche } 387*46c4c49dSIbrahim Kanouche 388*46c4c49dSIbrahim Kanouche var wg sync.WaitGroup 389*46c4c49dSIbrahim Kanouche for _, mr := range mrs { 390*46c4c49dSIbrahim Kanouche if !m.withinConfidenceThreshold(known.set, mr) { 391*46c4c49dSIbrahim Kanouche continue 392*46c4c49dSIbrahim Kanouche } 393*46c4c49dSIbrahim Kanouche 394*46c4c49dSIbrahim Kanouche wg.Add(1) 395*46c4c49dSIbrahim Kanouche go func(mr searchset.MatchRanges) { 396*46c4c49dSIbrahim Kanouche start, end := mr.TargetRange(m.unknown) 397*46c4c49dSIbrahim Kanouche conf := levDist(m.normUnknown[start:end], known.normalizedValue) 398*46c4c49dSIbrahim Kanouche if conf > 0.0 { 399*46c4c49dSIbrahim Kanouche m.mu.Lock() 400*46c4c49dSIbrahim Kanouche m.queue.Push(&Match{Name: known.key, Confidence: conf, Offset: start, Extent: end - start}) 401*46c4c49dSIbrahim Kanouche m.mu.Unlock() 402*46c4c49dSIbrahim Kanouche } 403*46c4c49dSIbrahim Kanouche wg.Done() 404*46c4c49dSIbrahim Kanouche }(mr) 405*46c4c49dSIbrahim Kanouche } 406*46c4c49dSIbrahim Kanouche wg.Wait() 407*46c4c49dSIbrahim Kanouche} 408*46c4c49dSIbrahim Kanouche 409*46c4c49dSIbrahim Kanouche// withinConfidenceThreshold returns the Confidence we have in the potential 410*46c4c49dSIbrahim Kanouche// match. It does this by calculating the ratio of what's matching to the 411*46c4c49dSIbrahim Kanouche// original known text. 412*46c4c49dSIbrahim Kanouchefunc (m *matcher) withinConfidenceThreshold(known *searchset.SearchSet, mr searchset.MatchRanges) bool { 413*46c4c49dSIbrahim Kanouche return float64(mr.Size())/float64(len(known.Tokens)) >= m.threshold 414*46c4c49dSIbrahim Kanouche} 415*46c4c49dSIbrahim Kanouche 416*46c4c49dSIbrahim Kanouche// multipleMatch returns a Queue of values that might be within the unknown 417*46c4c49dSIbrahim Kanouche// string. The values are compared via their Levenshtein Distance and ranked 418*46c4c49dSIbrahim Kanouche// with the nearest match at the beginning. 419*46c4c49dSIbrahim Kanouchefunc (c *Classifier) multipleMatch(unknown string) *pq.Queue { 420*46c4c49dSIbrahim Kanouche normUnknown := c.normalize(unknown) 421*46c4c49dSIbrahim Kanouche if normUnknown == "" { 422*46c4c49dSIbrahim Kanouche return nil 423*46c4c49dSIbrahim Kanouche } 424*46c4c49dSIbrahim Kanouche 425*46c4c49dSIbrahim Kanouche m := newMatcher(normUnknown, c.threshold) 426*46c4c49dSIbrahim Kanouche 427*46c4c49dSIbrahim Kanouche c.muValues.RLock() 428*46c4c49dSIbrahim Kanouche var kvals []*knownValue 429*46c4c49dSIbrahim Kanouche for _, known := range c.values { 430*46c4c49dSIbrahim Kanouche kvals = append(kvals, known) 431*46c4c49dSIbrahim Kanouche } 432*46c4c49dSIbrahim Kanouche c.muValues.RUnlock() 433*46c4c49dSIbrahim Kanouche 434*46c4c49dSIbrahim Kanouche var wg sync.WaitGroup 435*46c4c49dSIbrahim Kanouche wg.Add(len(kvals)) 436*46c4c49dSIbrahim Kanouche for _, known := range kvals { 437*46c4c49dSIbrahim Kanouche go func(known *knownValue) { 438*46c4c49dSIbrahim Kanouche if known.set == nil { 439*46c4c49dSIbrahim Kanouche k := searchset.New(known.normalizedValue, searchset.DefaultGranularity) 440*46c4c49dSIbrahim Kanouche c.muValues.Lock() 441*46c4c49dSIbrahim Kanouche c.values[known.key].set = k 442*46c4c49dSIbrahim Kanouche c.muValues.Unlock() 443*46c4c49dSIbrahim Kanouche } 444*46c4c49dSIbrahim Kanouche m.findMatches(known) 445*46c4c49dSIbrahim Kanouche wg.Done() 446*46c4c49dSIbrahim Kanouche }(known) 447*46c4c49dSIbrahim Kanouche } 448*46c4c49dSIbrahim Kanouche wg.Wait() 449*46c4c49dSIbrahim Kanouche return m.queue 450*46c4c49dSIbrahim Kanouche} 451*46c4c49dSIbrahim Kanouche 452*46c4c49dSIbrahim Kanouche// levDist runs the Levenshtein Distance algorithm on the known and unknown 453*46c4c49dSIbrahim Kanouche// texts to measure how well they match. 454*46c4c49dSIbrahim Kanouchefunc levDist(unknown, known string) float64 { 455*46c4c49dSIbrahim Kanouche if len(known) == 0 || len(unknown) == 0 { 456*46c4c49dSIbrahim Kanouche log.Printf("Zero-sized texts in Levenshtein Distance algorithm: known==%d, unknown==%d", len(known), len(unknown)) 457*46c4c49dSIbrahim Kanouche return 0.0 458*46c4c49dSIbrahim Kanouche } 459*46c4c49dSIbrahim Kanouche 460*46c4c49dSIbrahim Kanouche // Calculate the differences between the potentially matching known 461*46c4c49dSIbrahim Kanouche // text and the unknown text. 462*46c4c49dSIbrahim Kanouche diffs := dmp.DiffMain(unknown, known, false) 463*46c4c49dSIbrahim Kanouche end := diffRangeEnd(known, diffs) 464*46c4c49dSIbrahim Kanouche 465*46c4c49dSIbrahim Kanouche // Now execute the Levenshtein Distance algorithm to see how much it 466*46c4c49dSIbrahim Kanouche // does match. 467*46c4c49dSIbrahim Kanouche distance := dmp.DiffLevenshtein(diffs[:end]) 468*46c4c49dSIbrahim Kanouche return confidencePercentage(unknownTextLength(unknown, diffs), len(known), distance) 469*46c4c49dSIbrahim Kanouche} 470*46c4c49dSIbrahim Kanouche 471*46c4c49dSIbrahim Kanouche// unknownTextLength returns the length of the unknown text based on the diff range. 472*46c4c49dSIbrahim Kanouchefunc unknownTextLength(unknown string, diffs []diffmatchpatch.Diff) int { 473*46c4c49dSIbrahim Kanouche last := len(diffs) - 1 474*46c4c49dSIbrahim Kanouche for ; last >= 0; last-- { 475*46c4c49dSIbrahim Kanouche if diffs[last].Type == diffmatchpatch.DiffEqual { 476*46c4c49dSIbrahim Kanouche break 477*46c4c49dSIbrahim Kanouche } 478*46c4c49dSIbrahim Kanouche } 479*46c4c49dSIbrahim Kanouche ulen := 0 480*46c4c49dSIbrahim Kanouche for i := 0; i < last+1; i++ { 481*46c4c49dSIbrahim Kanouche switch diffs[i].Type { 482*46c4c49dSIbrahim Kanouche case diffmatchpatch.DiffEqual, diffmatchpatch.DiffDelete: 483*46c4c49dSIbrahim Kanouche ulen += len(diffs[i].Text) 484*46c4c49dSIbrahim Kanouche } 485*46c4c49dSIbrahim Kanouche } 486*46c4c49dSIbrahim Kanouche return ulen 487*46c4c49dSIbrahim Kanouche} 488*46c4c49dSIbrahim Kanouche 489*46c4c49dSIbrahim Kanouche// diffRangeEnd returns the end index for the "Diff" objects that constructs 490*46c4c49dSIbrahim Kanouche// (or nearly constructs) the "known" value. 491*46c4c49dSIbrahim Kanouchefunc diffRangeEnd(known string, diffs []diffmatchpatch.Diff) (end int) { 492*46c4c49dSIbrahim Kanouche var seen string 493*46c4c49dSIbrahim Kanouche for end = 0; end < len(diffs); end++ { 494*46c4c49dSIbrahim Kanouche if seen == known { 495*46c4c49dSIbrahim Kanouche // Once we've constructed the "known" value, then we've 496*46c4c49dSIbrahim Kanouche // reached the point in the diff list where more 497*46c4c49dSIbrahim Kanouche // "Diff"s would just make the Levenshtein Distance 498*46c4c49dSIbrahim Kanouche // less valid. There shouldn't be further "DiffEqual" 499*46c4c49dSIbrahim Kanouche // nodes, because there's nothing further to match in 500*46c4c49dSIbrahim Kanouche // the "known" text. 501*46c4c49dSIbrahim Kanouche break 502*46c4c49dSIbrahim Kanouche } 503*46c4c49dSIbrahim Kanouche switch diffs[end].Type { 504*46c4c49dSIbrahim Kanouche case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert: 505*46c4c49dSIbrahim Kanouche seen += diffs[end].Text 506*46c4c49dSIbrahim Kanouche } 507*46c4c49dSIbrahim Kanouche } 508*46c4c49dSIbrahim Kanouche return end 509*46c4c49dSIbrahim Kanouche} 510*46c4c49dSIbrahim Kanouche 511*46c4c49dSIbrahim Kanouche// confidencePercentage calculates how confident we are in the result of the 512*46c4c49dSIbrahim Kanouche// match. A percentage of "1.0" means an identical match. A confidence of "0.0" 513*46c4c49dSIbrahim Kanouche// means a complete mismatch. 514*46c4c49dSIbrahim Kanouchefunc confidencePercentage(ulen, klen, distance int) float64 { 515*46c4c49dSIbrahim Kanouche if ulen == 0 && klen == 0 { 516*46c4c49dSIbrahim Kanouche return 1.0 517*46c4c49dSIbrahim Kanouche } 518*46c4c49dSIbrahim Kanouche if ulen == 0 || klen == 0 || (distance > ulen && distance > klen) { 519*46c4c49dSIbrahim Kanouche return 0.0 520*46c4c49dSIbrahim Kanouche } 521*46c4c49dSIbrahim Kanouche return 1.0 - float64(distance)/float64(max(ulen, klen)) 522*46c4c49dSIbrahim Kanouche} 523*46c4c49dSIbrahim Kanouche 524*46c4c49dSIbrahim Kanouche// diffRatio calculates the ratio of the length of s1 and s2, returned as a 525*46c4c49dSIbrahim Kanouche// percentage of the length of the longer string. E.g., diffLength("abcd", "e") 526*46c4c49dSIbrahim Kanouche// would return 0.25 because "e" is 25% of the size of "abcd". Comparing 527*46c4c49dSIbrahim Kanouche// strings of equal length will return 1. 528*46c4c49dSIbrahim Kanouchefunc diffRatio(s1, s2 string) float64 { 529*46c4c49dSIbrahim Kanouche x, y := len(s1), len(s2) 530*46c4c49dSIbrahim Kanouche if x == 0 && y == 0 { 531*46c4c49dSIbrahim Kanouche // Both strings are zero length 532*46c4c49dSIbrahim Kanouche return 1.0 533*46c4c49dSIbrahim Kanouche } 534*46c4c49dSIbrahim Kanouche if x < y { 535*46c4c49dSIbrahim Kanouche return float64(x) / float64(y) 536*46c4c49dSIbrahim Kanouche } 537*46c4c49dSIbrahim Kanouche return float64(y) / float64(x) 538*46c4c49dSIbrahim Kanouche} 539*46c4c49dSIbrahim Kanouche 540*46c4c49dSIbrahim Kanouchefunc max(a, b int) int { 541*46c4c49dSIbrahim Kanouche if a > b { 542*46c4c49dSIbrahim Kanouche return a 543*46c4c49dSIbrahim Kanouche } 544*46c4c49dSIbrahim Kanouche return b 545*46c4c49dSIbrahim Kanouche} 546*46c4c49dSIbrahim Kanouche 547*46c4c49dSIbrahim Kanouchefunc min(a, b int) int { 548*46c4c49dSIbrahim Kanouche if a < b { 549*46c4c49dSIbrahim Kanouche return a 550*46c4c49dSIbrahim Kanouche } 551*46c4c49dSIbrahim Kanouche return b 552*46c4c49dSIbrahim Kanouche} 553*46c4c49dSIbrahim Kanouche 554*46c4c49dSIbrahim Kanouche// wsRegexp is a regexp used to identify blocks of whitespace. 555*46c4c49dSIbrahim Kanouchevar wsRegexp = regexp.MustCompile(`\s+`) 556*46c4c49dSIbrahim Kanouche 557*46c4c49dSIbrahim Kanouche// FlattenWhitespace will flatten contiguous blocks of whitespace down to a single space. 558*46c4c49dSIbrahim Kanouchevar FlattenWhitespace NormalizeFunc = func(s string) string { 559*46c4c49dSIbrahim Kanouche return wsRegexp.ReplaceAllString(s, " ") 560*46c4c49dSIbrahim Kanouche} 561