xref: /aosp_15_r20/external/licenseclassifier/stringclassifier/classifier.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc.
2*46c4c49dSIbrahim Kanouche//
3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
6*46c4c49dSIbrahim Kanouche//
7*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
8*46c4c49dSIbrahim Kanouche//
9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
13*46c4c49dSIbrahim Kanouche// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouche// Package stringclassifier finds the nearest match between a string and a set of known values. It
16*46c4c49dSIbrahim Kanouche// uses the Levenshtein Distance (LD) algorithm to determine this. A match with a large LD is less
17*46c4c49dSIbrahim Kanouche// likely to be correct than one with a small LD. A confidence percentage is returned, which
18*46c4c49dSIbrahim Kanouche// indicates how confident the algorithm is that the match is correct. The higher the percentage,
19*46c4c49dSIbrahim Kanouche// the greater the confidence that the match is correct.
20*46c4c49dSIbrahim Kanouche//
21*46c4c49dSIbrahim Kanouche// Example Usage:
22*46c4c49dSIbrahim Kanouche//
23*46c4c49dSIbrahim Kanouche//	type Text struct {
24*46c4c49dSIbrahim Kanouche//	  Name string
25*46c4c49dSIbrahim Kanouche//	  Text string
26*46c4c49dSIbrahim Kanouche//	}
27*46c4c49dSIbrahim Kanouche//
28*46c4c49dSIbrahim Kanouche//	func NewClassifier(knownTexts []Text) (*stringclassifier.Classifier, error) {
29*46c4c49dSIbrahim Kanouche//	  sc := stringclassifier.New(stringclassifier.FlattenWhitespace)
30*46c4c49dSIbrahim Kanouche//	  for _, known := range knownTexts {
31*46c4c49dSIbrahim Kanouche//	    if err := sc.AddValue(known.Name, known.Text); err != nil {
32*46c4c49dSIbrahim Kanouche//	      return nil, err
33*46c4c49dSIbrahim Kanouche//	    }
34*46c4c49dSIbrahim Kanouche//	  }
35*46c4c49dSIbrahim Kanouche//	  return sc, nil
36*46c4c49dSIbrahim Kanouche//	}
37*46c4c49dSIbrahim Kanouche//
38*46c4c49dSIbrahim Kanouche//	func IdentifyTexts(sc *stringclassifier.Classifier, unknownTexts []*Text) {
39*46c4c49dSIbrahim Kanouche//	  for _, unknown := range unknownTexts {
40*46c4c49dSIbrahim Kanouche//	    m := sc.NearestMatch(unknown.Text)
41*46c4c49dSIbrahim Kanouche//	    log.Printf("The nearest match to %q is %q (confidence: %v)",
42*46c4c49dSIbrahim Kanouche//	      unknown.Name, m.Name, m.Confidence)
43*46c4c49dSIbrahim Kanouche//	  }
44*46c4c49dSIbrahim Kanouche//	}
45*46c4c49dSIbrahim Kanouchepackage stringclassifier
46*46c4c49dSIbrahim Kanouche
47*46c4c49dSIbrahim Kanoucheimport (
48*46c4c49dSIbrahim Kanouche	"fmt"
49*46c4c49dSIbrahim Kanouche	"log"
50*46c4c49dSIbrahim Kanouche	"math"
51*46c4c49dSIbrahim Kanouche	"regexp"
52*46c4c49dSIbrahim Kanouche	"sort"
53*46c4c49dSIbrahim Kanouche	"sync"
54*46c4c49dSIbrahim Kanouche
55*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier/internal/pq"
56*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier/searchset"
57*46c4c49dSIbrahim Kanouche	"github.com/sergi/go-diff/diffmatchpatch"
58*46c4c49dSIbrahim Kanouche)
59*46c4c49dSIbrahim Kanouche
60*46c4c49dSIbrahim Kanouche// The diff/match/patch algorithm.
61*46c4c49dSIbrahim Kanouchevar dmp = diffmatchpatch.New()
62*46c4c49dSIbrahim Kanouche
63*46c4c49dSIbrahim Kanoucheconst (
64*46c4c49dSIbrahim Kanouche	// DefaultConfidenceThreshold is the minimum ratio threshold between
65*46c4c49dSIbrahim Kanouche	// the matching range and the full source range that we're willing to
66*46c4c49dSIbrahim Kanouche	// accept in order to say that the matching range will produce a
67*46c4c49dSIbrahim Kanouche	// sufficiently good edit distance. I.e., if the matching range is
68*46c4c49dSIbrahim Kanouche	// below this threshold we won't run the Levenshtein Distance algorithm
69*46c4c49dSIbrahim Kanouche	// on it.
70*46c4c49dSIbrahim Kanouche	DefaultConfidenceThreshold float64 = 0.80
71*46c4c49dSIbrahim Kanouche
72*46c4c49dSIbrahim Kanouche	defaultMinDiffRatio float64 = 0.75
73*46c4c49dSIbrahim Kanouche)
74*46c4c49dSIbrahim Kanouche
75*46c4c49dSIbrahim Kanouche// A Classifier matches a string to a set of known values.
76*46c4c49dSIbrahim Kanouchetype Classifier struct {
77*46c4c49dSIbrahim Kanouche	muValues    sync.RWMutex
78*46c4c49dSIbrahim Kanouche	values      map[string]*knownValue
79*46c4c49dSIbrahim Kanouche	normalizers []NormalizeFunc
80*46c4c49dSIbrahim Kanouche	threshold   float64
81*46c4c49dSIbrahim Kanouche
82*46c4c49dSIbrahim Kanouche	// MinDiffRatio defines the minimum ratio of the length difference
83*46c4c49dSIbrahim Kanouche	// allowed to consider a known value a possible match. This is used as
84*46c4c49dSIbrahim Kanouche	// a performance optimization to eliminate values that are unlikely to
85*46c4c49dSIbrahim Kanouche	// be a match.
86*46c4c49dSIbrahim Kanouche	//
87*46c4c49dSIbrahim Kanouche	// For example, a value of 0.75 means that the shorter string must be
88*46c4c49dSIbrahim Kanouche	// at least 75% the length of the longer string to consider it a
89*46c4c49dSIbrahim Kanouche	// possible match.
90*46c4c49dSIbrahim Kanouche	//
91*46c4c49dSIbrahim Kanouche	// Setting this to 1.0 will require that strings are identical length.
92*46c4c49dSIbrahim Kanouche	// Setting this to 0 will consider all known values as possible
93*46c4c49dSIbrahim Kanouche	// matches.
94*46c4c49dSIbrahim Kanouche	MinDiffRatio float64
95*46c4c49dSIbrahim Kanouche}
96*46c4c49dSIbrahim Kanouche
97*46c4c49dSIbrahim Kanouche// NormalizeFunc is a function that is used to normalize a string prior to comparison.
98*46c4c49dSIbrahim Kanouchetype NormalizeFunc func(string) string
99*46c4c49dSIbrahim Kanouche
100*46c4c49dSIbrahim Kanouche// New creates a new Classifier with the provided NormalizeFuncs. Each
101*46c4c49dSIbrahim Kanouche// NormalizeFunc is applied in order to a string before comparison.
102*46c4c49dSIbrahim Kanouchefunc New(threshold float64, funcs ...NormalizeFunc) *Classifier {
103*46c4c49dSIbrahim Kanouche	return &Classifier{
104*46c4c49dSIbrahim Kanouche		values:       make(map[string]*knownValue),
105*46c4c49dSIbrahim Kanouche		normalizers:  append([]NormalizeFunc(nil), funcs...),
106*46c4c49dSIbrahim Kanouche		threshold:    threshold,
107*46c4c49dSIbrahim Kanouche		MinDiffRatio: defaultMinDiffRatio,
108*46c4c49dSIbrahim Kanouche	}
109*46c4c49dSIbrahim Kanouche}
110*46c4c49dSIbrahim Kanouche
111*46c4c49dSIbrahim Kanouche// knownValue identifies a value in the corpus to match against.
112*46c4c49dSIbrahim Kanouchetype knownValue struct {
113*46c4c49dSIbrahim Kanouche	key             string
114*46c4c49dSIbrahim Kanouche	normalizedValue string
115*46c4c49dSIbrahim Kanouche	reValue         *regexp.Regexp
116*46c4c49dSIbrahim Kanouche	set             *searchset.SearchSet
117*46c4c49dSIbrahim Kanouche}
118*46c4c49dSIbrahim Kanouche
119*46c4c49dSIbrahim Kanouche// AddValue adds a known value to be matched against. If a value already exists
120*46c4c49dSIbrahim Kanouche// for key, an error is returned.
121*46c4c49dSIbrahim Kanouchefunc (c *Classifier) AddValue(key, value string) error {
122*46c4c49dSIbrahim Kanouche	c.muValues.Lock()
123*46c4c49dSIbrahim Kanouche	defer c.muValues.Unlock()
124*46c4c49dSIbrahim Kanouche	if _, ok := c.values[key]; ok {
125*46c4c49dSIbrahim Kanouche		return fmt.Errorf("value already registered with key %q", key)
126*46c4c49dSIbrahim Kanouche	}
127*46c4c49dSIbrahim Kanouche	norm := c.normalize(value)
128*46c4c49dSIbrahim Kanouche	c.values[key] = &knownValue{
129*46c4c49dSIbrahim Kanouche		key:             key,
130*46c4c49dSIbrahim Kanouche		normalizedValue: norm,
131*46c4c49dSIbrahim Kanouche		reValue:         regexp.MustCompile(norm),
132*46c4c49dSIbrahim Kanouche	}
133*46c4c49dSIbrahim Kanouche	return nil
134*46c4c49dSIbrahim Kanouche}
135*46c4c49dSIbrahim Kanouche
136*46c4c49dSIbrahim Kanouche// AddPrecomputedValue adds a known value to be matched against. The value has
137*46c4c49dSIbrahim Kanouche// already been normalized and the SearchSet object deserialized, so no
138*46c4c49dSIbrahim Kanouche// processing is necessary.
139*46c4c49dSIbrahim Kanouchefunc (c *Classifier) AddPrecomputedValue(key, value string, set *searchset.SearchSet) error {
140*46c4c49dSIbrahim Kanouche	c.muValues.Lock()
141*46c4c49dSIbrahim Kanouche	defer c.muValues.Unlock()
142*46c4c49dSIbrahim Kanouche	if _, ok := c.values[key]; ok {
143*46c4c49dSIbrahim Kanouche		return fmt.Errorf("value already registered with key %q", key)
144*46c4c49dSIbrahim Kanouche	}
145*46c4c49dSIbrahim Kanouche	set.GenerateNodeList()
146*46c4c49dSIbrahim Kanouche	c.values[key] = &knownValue{
147*46c4c49dSIbrahim Kanouche		key:             key,
148*46c4c49dSIbrahim Kanouche		normalizedValue: value,
149*46c4c49dSIbrahim Kanouche		reValue:         regexp.MustCompile(value),
150*46c4c49dSIbrahim Kanouche		set:             set,
151*46c4c49dSIbrahim Kanouche	}
152*46c4c49dSIbrahim Kanouche	return nil
153*46c4c49dSIbrahim Kanouche}
154*46c4c49dSIbrahim Kanouche
155*46c4c49dSIbrahim Kanouche// normalize a string by applying each of the registered NormalizeFuncs.
156*46c4c49dSIbrahim Kanouchefunc (c *Classifier) normalize(s string) string {
157*46c4c49dSIbrahim Kanouche	for _, fn := range c.normalizers {
158*46c4c49dSIbrahim Kanouche		s = fn(s)
159*46c4c49dSIbrahim Kanouche	}
160*46c4c49dSIbrahim Kanouche	return s
161*46c4c49dSIbrahim Kanouche}
162*46c4c49dSIbrahim Kanouche
163*46c4c49dSIbrahim Kanouche// Match identifies the result of matching a string against a knownValue.
164*46c4c49dSIbrahim Kanouchetype Match struct {
165*46c4c49dSIbrahim Kanouche	Name       string  // Name of knownValue that was matched
166*46c4c49dSIbrahim Kanouche	Confidence float64 // Confidence percentage
167*46c4c49dSIbrahim Kanouche	Offset     int     // The offset into the unknown string the match was made
168*46c4c49dSIbrahim Kanouche	Extent     int     // The length from the offset into the unknown string
169*46c4c49dSIbrahim Kanouche}
170*46c4c49dSIbrahim Kanouche
171*46c4c49dSIbrahim Kanouche// Matches is a list of Match-es. This is here mainly so that the list can be
172*46c4c49dSIbrahim Kanouche// sorted.
173*46c4c49dSIbrahim Kanouchetype Matches []*Match
174*46c4c49dSIbrahim Kanouche
175*46c4c49dSIbrahim Kanouchefunc (m Matches) Len() int      { return len(m) }
176*46c4c49dSIbrahim Kanouchefunc (m Matches) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
177*46c4c49dSIbrahim Kanouchefunc (m Matches) Less(i, j int) bool {
178*46c4c49dSIbrahim Kanouche	if math.Abs(m[j].Confidence-m[i].Confidence) < math.SmallestNonzeroFloat64 {
179*46c4c49dSIbrahim Kanouche		if m[i].Name == m[j].Name {
180*46c4c49dSIbrahim Kanouche			if m[i].Offset > m[j].Offset {
181*46c4c49dSIbrahim Kanouche				return false
182*46c4c49dSIbrahim Kanouche			}
183*46c4c49dSIbrahim Kanouche			if m[i].Offset == m[j].Offset {
184*46c4c49dSIbrahim Kanouche				return m[i].Extent > m[j].Extent
185*46c4c49dSIbrahim Kanouche			}
186*46c4c49dSIbrahim Kanouche			return true
187*46c4c49dSIbrahim Kanouche		}
188*46c4c49dSIbrahim Kanouche		return m[i].Name < m[j].Name
189*46c4c49dSIbrahim Kanouche	}
190*46c4c49dSIbrahim Kanouche	return m[i].Confidence > m[j].Confidence
191*46c4c49dSIbrahim Kanouche}
192*46c4c49dSIbrahim Kanouche
193*46c4c49dSIbrahim Kanouche// Names returns an unsorted slice of the names of the matched licenses.
194*46c4c49dSIbrahim Kanouchefunc (m Matches) Names() []string {
195*46c4c49dSIbrahim Kanouche	var names []string
196*46c4c49dSIbrahim Kanouche	for _, n := range m {
197*46c4c49dSIbrahim Kanouche		names = append(names, n.Name)
198*46c4c49dSIbrahim Kanouche	}
199*46c4c49dSIbrahim Kanouche	return names
200*46c4c49dSIbrahim Kanouche}
201*46c4c49dSIbrahim Kanouche
202*46c4c49dSIbrahim Kanouche// uniquify goes through the matches and removes any that are contained within
203*46c4c49dSIbrahim Kanouche// one with a higher confidence. This assumes that Matches is sorted.
204*46c4c49dSIbrahim Kanouchefunc (m Matches) uniquify() Matches {
205*46c4c49dSIbrahim Kanouche	type matchedRange struct {
206*46c4c49dSIbrahim Kanouche		offset, extent int
207*46c4c49dSIbrahim Kanouche	}
208*46c4c49dSIbrahim Kanouche
209*46c4c49dSIbrahim Kanouche	var matched []matchedRange
210*46c4c49dSIbrahim Kanouche	var matches Matches
211*46c4c49dSIbrahim KanoucheOUTER:
212*46c4c49dSIbrahim Kanouche	for _, match := range m {
213*46c4c49dSIbrahim Kanouche		for _, mr := range matched {
214*46c4c49dSIbrahim Kanouche			if match.Offset >= mr.offset && match.Offset <= mr.offset+mr.extent {
215*46c4c49dSIbrahim Kanouche				continue OUTER
216*46c4c49dSIbrahim Kanouche			}
217*46c4c49dSIbrahim Kanouche		}
218*46c4c49dSIbrahim Kanouche		matched = append(matched, matchedRange{match.Offset, match.Extent})
219*46c4c49dSIbrahim Kanouche		matches = append(matches, match)
220*46c4c49dSIbrahim Kanouche	}
221*46c4c49dSIbrahim Kanouche
222*46c4c49dSIbrahim Kanouche	return matches
223*46c4c49dSIbrahim Kanouche}
224*46c4c49dSIbrahim Kanouche
225*46c4c49dSIbrahim Kanouche// NearestMatch returns the name of the known value that most closely matches
226*46c4c49dSIbrahim Kanouche// the unknown string and a confidence percentage is returned indicating how
227*46c4c49dSIbrahim Kanouche// confident the classifier is in the result. A percentage of "1.0" indicates
228*46c4c49dSIbrahim Kanouche// an exact match, while a percentage of "0.0" indicates a complete mismatch.
229*46c4c49dSIbrahim Kanouche//
230*46c4c49dSIbrahim Kanouche// If the string is equidistant from multiple known values, it is undefined
231*46c4c49dSIbrahim Kanouche// which will be returned.
232*46c4c49dSIbrahim Kanouchefunc (c *Classifier) NearestMatch(s string) *Match {
233*46c4c49dSIbrahim Kanouche	pq := c.nearestMatch(s)
234*46c4c49dSIbrahim Kanouche	if pq.Len() == 0 {
235*46c4c49dSIbrahim Kanouche		return &Match{}
236*46c4c49dSIbrahim Kanouche	}
237*46c4c49dSIbrahim Kanouche	return pq.Pop().(*Match)
238*46c4c49dSIbrahim Kanouche}
239*46c4c49dSIbrahim Kanouche
240*46c4c49dSIbrahim Kanouche// MultipleMatch tries to determine which known strings are found within an
241*46c4c49dSIbrahim Kanouche// unknown string. This differs from "NearestMatch" in that it looks only at
242*46c4c49dSIbrahim Kanouche// those areas within the unknown string that are likely to match. A list of
243*46c4c49dSIbrahim Kanouche// potential matches are returned. It's up to the caller to determine which
244*46c4c49dSIbrahim Kanouche// ones are acceptable.
245*46c4c49dSIbrahim Kanouchefunc (c *Classifier) MultipleMatch(s string) (matches Matches) {
246*46c4c49dSIbrahim Kanouche	pq := c.multipleMatch(s)
247*46c4c49dSIbrahim Kanouche	if pq == nil {
248*46c4c49dSIbrahim Kanouche		return matches
249*46c4c49dSIbrahim Kanouche	}
250*46c4c49dSIbrahim Kanouche
251*46c4c49dSIbrahim Kanouche	// A map to remove duplicate entries.
252*46c4c49dSIbrahim Kanouche	m := make(map[Match]bool)
253*46c4c49dSIbrahim Kanouche
254*46c4c49dSIbrahim Kanouche	for pq.Len() != 0 {
255*46c4c49dSIbrahim Kanouche		v := pq.Pop().(*Match)
256*46c4c49dSIbrahim Kanouche		if _, ok := m[*v]; !ok {
257*46c4c49dSIbrahim Kanouche			m[*v] = true
258*46c4c49dSIbrahim Kanouche			matches = append(matches, v)
259*46c4c49dSIbrahim Kanouche		}
260*46c4c49dSIbrahim Kanouche	}
261*46c4c49dSIbrahim Kanouche
262*46c4c49dSIbrahim Kanouche	sort.Sort(matches)
263*46c4c49dSIbrahim Kanouche	return matches.uniquify()
264*46c4c49dSIbrahim Kanouche}
265*46c4c49dSIbrahim Kanouche
266*46c4c49dSIbrahim Kanouche// possibleMatch identifies a known value and it's diffRatio to a given string.
267*46c4c49dSIbrahim Kanouchetype possibleMatch struct {
268*46c4c49dSIbrahim Kanouche	value     *knownValue
269*46c4c49dSIbrahim Kanouche	diffRatio float64
270*46c4c49dSIbrahim Kanouche}
271*46c4c49dSIbrahim Kanouche
272*46c4c49dSIbrahim Kanouche// likelyMatches is a slice of possibleMatches that can be sorted by their
273*46c4c49dSIbrahim Kanouche// diffRatio to a given string, such that the most likely matches (based on
274*46c4c49dSIbrahim Kanouche// length) are at the beginning.
275*46c4c49dSIbrahim Kanouchetype likelyMatches []possibleMatch
276*46c4c49dSIbrahim Kanouche
277*46c4c49dSIbrahim Kanouchefunc (m likelyMatches) Len() int           { return len(m) }
278*46c4c49dSIbrahim Kanouchefunc (m likelyMatches) Less(i, j int) bool { return m[i].diffRatio > m[j].diffRatio }
279*46c4c49dSIbrahim Kanouchefunc (m likelyMatches) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
280*46c4c49dSIbrahim Kanouche
281*46c4c49dSIbrahim Kanouche// nearestMatch returns a Queue of values that the unknown string may be. The
282*46c4c49dSIbrahim Kanouche// values are compared via their Levenshtein Distance and ranked with the
283*46c4c49dSIbrahim Kanouche// nearest match at the beginning.
284*46c4c49dSIbrahim Kanouchefunc (c *Classifier) nearestMatch(unknown string) *pq.Queue {
285*46c4c49dSIbrahim Kanouche	var mu sync.Mutex // Protect the priority queue.
286*46c4c49dSIbrahim Kanouche	pq := pq.NewQueue(func(x, y interface{}) bool {
287*46c4c49dSIbrahim Kanouche		return x.(*Match).Confidence > y.(*Match).Confidence
288*46c4c49dSIbrahim Kanouche	}, nil)
289*46c4c49dSIbrahim Kanouche
290*46c4c49dSIbrahim Kanouche	unknown = c.normalize(unknown)
291*46c4c49dSIbrahim Kanouche	if len(unknown) == 0 {
292*46c4c49dSIbrahim Kanouche		return pq
293*46c4c49dSIbrahim Kanouche	}
294*46c4c49dSIbrahim Kanouche
295*46c4c49dSIbrahim Kanouche	c.muValues.RLock()
296*46c4c49dSIbrahim Kanouche	var likely likelyMatches
297*46c4c49dSIbrahim Kanouche	for _, v := range c.values {
298*46c4c49dSIbrahim Kanouche		dr := diffRatio(unknown, v.normalizedValue)
299*46c4c49dSIbrahim Kanouche		if dr < c.MinDiffRatio {
300*46c4c49dSIbrahim Kanouche			continue
301*46c4c49dSIbrahim Kanouche		}
302*46c4c49dSIbrahim Kanouche		if unknown == v.normalizedValue {
303*46c4c49dSIbrahim Kanouche			// We found an exact match.
304*46c4c49dSIbrahim Kanouche			pq.Push(&Match{Name: v.key, Confidence: 1.0, Offset: 0, Extent: len(unknown)})
305*46c4c49dSIbrahim Kanouche			c.muValues.RUnlock()
306*46c4c49dSIbrahim Kanouche			return pq
307*46c4c49dSIbrahim Kanouche		}
308*46c4c49dSIbrahim Kanouche		likely = append(likely, possibleMatch{value: v, diffRatio: dr})
309*46c4c49dSIbrahim Kanouche	}
310*46c4c49dSIbrahim Kanouche	c.muValues.RUnlock()
311*46c4c49dSIbrahim Kanouche	sort.Sort(likely)
312*46c4c49dSIbrahim Kanouche
313*46c4c49dSIbrahim Kanouche	var wg sync.WaitGroup
314*46c4c49dSIbrahim Kanouche	classifyString := func(name, unknown, known string) {
315*46c4c49dSIbrahim Kanouche		defer wg.Done()
316*46c4c49dSIbrahim Kanouche
317*46c4c49dSIbrahim Kanouche		diffs := dmp.DiffMain(unknown, known, true)
318*46c4c49dSIbrahim Kanouche		distance := dmp.DiffLevenshtein(diffs)
319*46c4c49dSIbrahim Kanouche		confidence := confidencePercentage(len(unknown), len(known), distance)
320*46c4c49dSIbrahim Kanouche		if confidence > 0.0 {
321*46c4c49dSIbrahim Kanouche			mu.Lock()
322*46c4c49dSIbrahim Kanouche			pq.Push(&Match{Name: name, Confidence: confidence, Offset: 0, Extent: len(unknown)})
323*46c4c49dSIbrahim Kanouche			mu.Unlock()
324*46c4c49dSIbrahim Kanouche		}
325*46c4c49dSIbrahim Kanouche	}
326*46c4c49dSIbrahim Kanouche
327*46c4c49dSIbrahim Kanouche	wg.Add(len(likely))
328*46c4c49dSIbrahim Kanouche	for _, known := range likely {
329*46c4c49dSIbrahim Kanouche		go classifyString(known.value.key, unknown, known.value.normalizedValue)
330*46c4c49dSIbrahim Kanouche	}
331*46c4c49dSIbrahim Kanouche	wg.Wait()
332*46c4c49dSIbrahim Kanouche	return pq
333*46c4c49dSIbrahim Kanouche}
334*46c4c49dSIbrahim Kanouche
335*46c4c49dSIbrahim Kanouche// matcher finds all potential matches of "known" in "unknown". The results are
336*46c4c49dSIbrahim Kanouche// placed in "queue".
337*46c4c49dSIbrahim Kanouchetype matcher struct {
338*46c4c49dSIbrahim Kanouche	unknown     *searchset.SearchSet
339*46c4c49dSIbrahim Kanouche	normUnknown string
340*46c4c49dSIbrahim Kanouche	threshold   float64
341*46c4c49dSIbrahim Kanouche
342*46c4c49dSIbrahim Kanouche	mu    sync.Mutex
343*46c4c49dSIbrahim Kanouche	queue *pq.Queue
344*46c4c49dSIbrahim Kanouche}
345*46c4c49dSIbrahim Kanouche
346*46c4c49dSIbrahim Kanouche// newMatcher creates a "matcher" object.
347*46c4c49dSIbrahim Kanouchefunc newMatcher(unknown string, threshold float64) *matcher {
348*46c4c49dSIbrahim Kanouche	return &matcher{
349*46c4c49dSIbrahim Kanouche		unknown:     searchset.New(unknown, searchset.DefaultGranularity),
350*46c4c49dSIbrahim Kanouche		normUnknown: unknown,
351*46c4c49dSIbrahim Kanouche		threshold:   threshold,
352*46c4c49dSIbrahim Kanouche		queue: pq.NewQueue(func(x, y interface{}) bool {
353*46c4c49dSIbrahim Kanouche			return x.(*Match).Confidence > y.(*Match).Confidence
354*46c4c49dSIbrahim Kanouche		}, nil),
355*46c4c49dSIbrahim Kanouche	}
356*46c4c49dSIbrahim Kanouche}
357*46c4c49dSIbrahim Kanouche
358*46c4c49dSIbrahim Kanouche// findMatches takes a known text and finds all potential instances of it in
359*46c4c49dSIbrahim Kanouche// the unknown text. The resulting matches can then filtered to determine which
360*46c4c49dSIbrahim Kanouche// are the best matches.
361*46c4c49dSIbrahim Kanouchefunc (m *matcher) findMatches(known *knownValue) {
362*46c4c49dSIbrahim Kanouche	var mrs []searchset.MatchRanges
363*46c4c49dSIbrahim Kanouche	if all := known.reValue.FindAllStringIndex(m.normUnknown, -1); all != nil {
364*46c4c49dSIbrahim Kanouche		// We found exact matches. Just use those!
365*46c4c49dSIbrahim Kanouche		for _, a := range all {
366*46c4c49dSIbrahim Kanouche			var start, end int
367*46c4c49dSIbrahim Kanouche			for i, tok := range m.unknown.Tokens {
368*46c4c49dSIbrahim Kanouche				if tok.Offset == a[0] {
369*46c4c49dSIbrahim Kanouche					start = i
370*46c4c49dSIbrahim Kanouche				} else if tok.Offset >= a[len(a)-1]-len(tok.Text) {
371*46c4c49dSIbrahim Kanouche					end = i
372*46c4c49dSIbrahim Kanouche					break
373*46c4c49dSIbrahim Kanouche				}
374*46c4c49dSIbrahim Kanouche			}
375*46c4c49dSIbrahim Kanouche
376*46c4c49dSIbrahim Kanouche			mrs = append(mrs, searchset.MatchRanges{{
377*46c4c49dSIbrahim Kanouche				SrcStart:    0,
378*46c4c49dSIbrahim Kanouche				SrcEnd:      len(known.set.Tokens),
379*46c4c49dSIbrahim Kanouche				TargetStart: start,
380*46c4c49dSIbrahim Kanouche				TargetEnd:   end + 1,
381*46c4c49dSIbrahim Kanouche			}})
382*46c4c49dSIbrahim Kanouche		}
383*46c4c49dSIbrahim Kanouche	} else {
384*46c4c49dSIbrahim Kanouche		// No exact match. Perform a more thorough match.
385*46c4c49dSIbrahim Kanouche		mrs = searchset.FindPotentialMatches(known.set, m.unknown)
386*46c4c49dSIbrahim Kanouche	}
387*46c4c49dSIbrahim Kanouche
388*46c4c49dSIbrahim Kanouche	var wg sync.WaitGroup
389*46c4c49dSIbrahim Kanouche	for _, mr := range mrs {
390*46c4c49dSIbrahim Kanouche		if !m.withinConfidenceThreshold(known.set, mr) {
391*46c4c49dSIbrahim Kanouche			continue
392*46c4c49dSIbrahim Kanouche		}
393*46c4c49dSIbrahim Kanouche
394*46c4c49dSIbrahim Kanouche		wg.Add(1)
395*46c4c49dSIbrahim Kanouche		go func(mr searchset.MatchRanges) {
396*46c4c49dSIbrahim Kanouche			start, end := mr.TargetRange(m.unknown)
397*46c4c49dSIbrahim Kanouche			conf := levDist(m.normUnknown[start:end], known.normalizedValue)
398*46c4c49dSIbrahim Kanouche			if conf > 0.0 {
399*46c4c49dSIbrahim Kanouche				m.mu.Lock()
400*46c4c49dSIbrahim Kanouche				m.queue.Push(&Match{Name: known.key, Confidence: conf, Offset: start, Extent: end - start})
401*46c4c49dSIbrahim Kanouche				m.mu.Unlock()
402*46c4c49dSIbrahim Kanouche			}
403*46c4c49dSIbrahim Kanouche			wg.Done()
404*46c4c49dSIbrahim Kanouche		}(mr)
405*46c4c49dSIbrahim Kanouche	}
406*46c4c49dSIbrahim Kanouche	wg.Wait()
407*46c4c49dSIbrahim Kanouche}
408*46c4c49dSIbrahim Kanouche
409*46c4c49dSIbrahim Kanouche// withinConfidenceThreshold returns the Confidence we have in the potential
410*46c4c49dSIbrahim Kanouche// match. It does this by calculating the ratio of what's matching to the
411*46c4c49dSIbrahim Kanouche// original known text.
412*46c4c49dSIbrahim Kanouchefunc (m *matcher) withinConfidenceThreshold(known *searchset.SearchSet, mr searchset.MatchRanges) bool {
413*46c4c49dSIbrahim Kanouche	return float64(mr.Size())/float64(len(known.Tokens)) >= m.threshold
414*46c4c49dSIbrahim Kanouche}
415*46c4c49dSIbrahim Kanouche
416*46c4c49dSIbrahim Kanouche// multipleMatch returns a Queue of values that might be within the unknown
417*46c4c49dSIbrahim Kanouche// string. The values are compared via their Levenshtein Distance and ranked
418*46c4c49dSIbrahim Kanouche// with the nearest match at the beginning.
419*46c4c49dSIbrahim Kanouchefunc (c *Classifier) multipleMatch(unknown string) *pq.Queue {
420*46c4c49dSIbrahim Kanouche	normUnknown := c.normalize(unknown)
421*46c4c49dSIbrahim Kanouche	if normUnknown == "" {
422*46c4c49dSIbrahim Kanouche		return nil
423*46c4c49dSIbrahim Kanouche	}
424*46c4c49dSIbrahim Kanouche
425*46c4c49dSIbrahim Kanouche	m := newMatcher(normUnknown, c.threshold)
426*46c4c49dSIbrahim Kanouche
427*46c4c49dSIbrahim Kanouche	c.muValues.RLock()
428*46c4c49dSIbrahim Kanouche	var kvals []*knownValue
429*46c4c49dSIbrahim Kanouche	for _, known := range c.values {
430*46c4c49dSIbrahim Kanouche		kvals = append(kvals, known)
431*46c4c49dSIbrahim Kanouche	}
432*46c4c49dSIbrahim Kanouche	c.muValues.RUnlock()
433*46c4c49dSIbrahim Kanouche
434*46c4c49dSIbrahim Kanouche	var wg sync.WaitGroup
435*46c4c49dSIbrahim Kanouche	wg.Add(len(kvals))
436*46c4c49dSIbrahim Kanouche	for _, known := range kvals {
437*46c4c49dSIbrahim Kanouche		go func(known *knownValue) {
438*46c4c49dSIbrahim Kanouche			if known.set == nil {
439*46c4c49dSIbrahim Kanouche				k := searchset.New(known.normalizedValue, searchset.DefaultGranularity)
440*46c4c49dSIbrahim Kanouche				c.muValues.Lock()
441*46c4c49dSIbrahim Kanouche				c.values[known.key].set = k
442*46c4c49dSIbrahim Kanouche				c.muValues.Unlock()
443*46c4c49dSIbrahim Kanouche			}
444*46c4c49dSIbrahim Kanouche			m.findMatches(known)
445*46c4c49dSIbrahim Kanouche			wg.Done()
446*46c4c49dSIbrahim Kanouche		}(known)
447*46c4c49dSIbrahim Kanouche	}
448*46c4c49dSIbrahim Kanouche	wg.Wait()
449*46c4c49dSIbrahim Kanouche	return m.queue
450*46c4c49dSIbrahim Kanouche}
451*46c4c49dSIbrahim Kanouche
452*46c4c49dSIbrahim Kanouche// levDist runs the Levenshtein Distance algorithm on the known and unknown
453*46c4c49dSIbrahim Kanouche// texts to measure how well they match.
454*46c4c49dSIbrahim Kanouchefunc levDist(unknown, known string) float64 {
455*46c4c49dSIbrahim Kanouche	if len(known) == 0 || len(unknown) == 0 {
456*46c4c49dSIbrahim Kanouche		log.Printf("Zero-sized texts in Levenshtein Distance algorithm: known==%d, unknown==%d", len(known), len(unknown))
457*46c4c49dSIbrahim Kanouche		return 0.0
458*46c4c49dSIbrahim Kanouche	}
459*46c4c49dSIbrahim Kanouche
460*46c4c49dSIbrahim Kanouche	// Calculate the differences between the potentially matching known
461*46c4c49dSIbrahim Kanouche	// text and the unknown text.
462*46c4c49dSIbrahim Kanouche	diffs := dmp.DiffMain(unknown, known, false)
463*46c4c49dSIbrahim Kanouche	end := diffRangeEnd(known, diffs)
464*46c4c49dSIbrahim Kanouche
465*46c4c49dSIbrahim Kanouche	// Now execute the Levenshtein Distance algorithm to see how much it
466*46c4c49dSIbrahim Kanouche	// does match.
467*46c4c49dSIbrahim Kanouche	distance := dmp.DiffLevenshtein(diffs[:end])
468*46c4c49dSIbrahim Kanouche	return confidencePercentage(unknownTextLength(unknown, diffs), len(known), distance)
469*46c4c49dSIbrahim Kanouche}
470*46c4c49dSIbrahim Kanouche
471*46c4c49dSIbrahim Kanouche// unknownTextLength returns the length of the unknown text based on the diff range.
472*46c4c49dSIbrahim Kanouchefunc unknownTextLength(unknown string, diffs []diffmatchpatch.Diff) int {
473*46c4c49dSIbrahim Kanouche	last := len(diffs) - 1
474*46c4c49dSIbrahim Kanouche	for ; last >= 0; last-- {
475*46c4c49dSIbrahim Kanouche		if diffs[last].Type == diffmatchpatch.DiffEqual {
476*46c4c49dSIbrahim Kanouche			break
477*46c4c49dSIbrahim Kanouche		}
478*46c4c49dSIbrahim Kanouche	}
479*46c4c49dSIbrahim Kanouche	ulen := 0
480*46c4c49dSIbrahim Kanouche	for i := 0; i < last+1; i++ {
481*46c4c49dSIbrahim Kanouche		switch diffs[i].Type {
482*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffDelete:
483*46c4c49dSIbrahim Kanouche			ulen += len(diffs[i].Text)
484*46c4c49dSIbrahim Kanouche		}
485*46c4c49dSIbrahim Kanouche	}
486*46c4c49dSIbrahim Kanouche	return ulen
487*46c4c49dSIbrahim Kanouche}
488*46c4c49dSIbrahim Kanouche
489*46c4c49dSIbrahim Kanouche// diffRangeEnd returns the end index for the "Diff" objects that constructs
490*46c4c49dSIbrahim Kanouche// (or nearly constructs) the "known" value.
491*46c4c49dSIbrahim Kanouchefunc diffRangeEnd(known string, diffs []diffmatchpatch.Diff) (end int) {
492*46c4c49dSIbrahim Kanouche	var seen string
493*46c4c49dSIbrahim Kanouche	for end = 0; end < len(diffs); end++ {
494*46c4c49dSIbrahim Kanouche		if seen == known {
495*46c4c49dSIbrahim Kanouche			// Once we've constructed the "known" value, then we've
496*46c4c49dSIbrahim Kanouche			// reached the point in the diff list where more
497*46c4c49dSIbrahim Kanouche			// "Diff"s would just make the Levenshtein Distance
498*46c4c49dSIbrahim Kanouche			// less valid. There shouldn't be further "DiffEqual"
499*46c4c49dSIbrahim Kanouche			// nodes, because there's nothing further to match in
500*46c4c49dSIbrahim Kanouche			// the "known" text.
501*46c4c49dSIbrahim Kanouche			break
502*46c4c49dSIbrahim Kanouche		}
503*46c4c49dSIbrahim Kanouche		switch diffs[end].Type {
504*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
505*46c4c49dSIbrahim Kanouche			seen += diffs[end].Text
506*46c4c49dSIbrahim Kanouche		}
507*46c4c49dSIbrahim Kanouche	}
508*46c4c49dSIbrahim Kanouche	return end
509*46c4c49dSIbrahim Kanouche}
510*46c4c49dSIbrahim Kanouche
511*46c4c49dSIbrahim Kanouche// confidencePercentage calculates how confident we are in the result of the
512*46c4c49dSIbrahim Kanouche// match. A percentage of "1.0" means an identical match. A confidence of "0.0"
513*46c4c49dSIbrahim Kanouche// means a complete mismatch.
514*46c4c49dSIbrahim Kanouchefunc confidencePercentage(ulen, klen, distance int) float64 {
515*46c4c49dSIbrahim Kanouche	if ulen == 0 && klen == 0 {
516*46c4c49dSIbrahim Kanouche		return 1.0
517*46c4c49dSIbrahim Kanouche	}
518*46c4c49dSIbrahim Kanouche	if ulen == 0 || klen == 0 || (distance > ulen && distance > klen) {
519*46c4c49dSIbrahim Kanouche		return 0.0
520*46c4c49dSIbrahim Kanouche	}
521*46c4c49dSIbrahim Kanouche	return 1.0 - float64(distance)/float64(max(ulen, klen))
522*46c4c49dSIbrahim Kanouche}
523*46c4c49dSIbrahim Kanouche
524*46c4c49dSIbrahim Kanouche// diffRatio calculates the ratio of the length of s1 and s2, returned as a
525*46c4c49dSIbrahim Kanouche// percentage of the length of the longer string. E.g., diffLength("abcd", "e")
526*46c4c49dSIbrahim Kanouche// would return 0.25 because "e" is 25% of the size of "abcd". Comparing
527*46c4c49dSIbrahim Kanouche// strings of equal length will return 1.
528*46c4c49dSIbrahim Kanouchefunc diffRatio(s1, s2 string) float64 {
529*46c4c49dSIbrahim Kanouche	x, y := len(s1), len(s2)
530*46c4c49dSIbrahim Kanouche	if x == 0 && y == 0 {
531*46c4c49dSIbrahim Kanouche		// Both strings are zero length
532*46c4c49dSIbrahim Kanouche		return 1.0
533*46c4c49dSIbrahim Kanouche	}
534*46c4c49dSIbrahim Kanouche	if x < y {
535*46c4c49dSIbrahim Kanouche		return float64(x) / float64(y)
536*46c4c49dSIbrahim Kanouche	}
537*46c4c49dSIbrahim Kanouche	return float64(y) / float64(x)
538*46c4c49dSIbrahim Kanouche}
539*46c4c49dSIbrahim Kanouche
540*46c4c49dSIbrahim Kanouchefunc max(a, b int) int {
541*46c4c49dSIbrahim Kanouche	if a > b {
542*46c4c49dSIbrahim Kanouche		return a
543*46c4c49dSIbrahim Kanouche	}
544*46c4c49dSIbrahim Kanouche	return b
545*46c4c49dSIbrahim Kanouche}
546*46c4c49dSIbrahim Kanouche
547*46c4c49dSIbrahim Kanouchefunc min(a, b int) int {
548*46c4c49dSIbrahim Kanouche	if a < b {
549*46c4c49dSIbrahim Kanouche		return a
550*46c4c49dSIbrahim Kanouche	}
551*46c4c49dSIbrahim Kanouche	return b
552*46c4c49dSIbrahim Kanouche}
553*46c4c49dSIbrahim Kanouche
554*46c4c49dSIbrahim Kanouche// wsRegexp is a regexp used to identify blocks of whitespace.
555*46c4c49dSIbrahim Kanouchevar wsRegexp = regexp.MustCompile(`\s+`)
556*46c4c49dSIbrahim Kanouche
557*46c4c49dSIbrahim Kanouche// FlattenWhitespace will flatten contiguous blocks of whitespace down to a single space.
558*46c4c49dSIbrahim Kanouchevar FlattenWhitespace NormalizeFunc = func(s string) string {
559*46c4c49dSIbrahim Kanouche	return wsRegexp.ReplaceAllString(s, " ")
560*46c4c49dSIbrahim Kanouche}
561