licenseclassifier/stringclassifier/classifier.go

*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc.
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
*46c4c49dSIbrahim Kanouche// limitations under the License.
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// Package stringclassifier finds the nearest match between a string and a set of known values. It
*46c4c49dSIbrahim Kanouche// uses the Levenshtein Distance (LD) algorithm to determine this. A match with a large LD is less
*46c4c49dSIbrahim Kanouche// likely to be correct than one with a small LD. A confidence percentage is returned, which
*46c4c49dSIbrahim Kanouche// indicates how confident the algorithm is that the match is correct. The higher the percentage,
*46c4c49dSIbrahim Kanouche// the greater the confidence that the match is correct.
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche// Example Usage:
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche//	type Text struct {
*46c4c49dSIbrahim Kanouche//	  Name string
*46c4c49dSIbrahim Kanouche//	  Text string
*46c4c49dSIbrahim Kanouche//	}
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche//	func NewClassifier(knownTexts []Text) (*stringclassifier.Classifier, error) {
//	  sc := stringclassifier.New(stringclassifier.DefaultConfidenceThreshold, stringclassifier.FlattenWhitespace)
*46c4c49dSIbrahim Kanouche//	  for _, known := range knownTexts {
*46c4c49dSIbrahim Kanouche//	    if err := sc.AddValue(known.Name, known.Text); err != nil {
*46c4c49dSIbrahim Kanouche//	      return nil, err
*46c4c49dSIbrahim Kanouche//	    }
*46c4c49dSIbrahim Kanouche//	  }
*46c4c49dSIbrahim Kanouche//	  return sc, nil
*46c4c49dSIbrahim Kanouche//	}
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche//	func IdentifyTexts(sc *stringclassifier.Classifier, unknownTexts []*Text) {
*46c4c49dSIbrahim Kanouche//	  for _, unknown := range unknownTexts {
*46c4c49dSIbrahim Kanouche//	    m := sc.NearestMatch(unknown.Text)
*46c4c49dSIbrahim Kanouche//	    log.Printf("The nearest match to %q is %q (confidence: %v)",
*46c4c49dSIbrahim Kanouche//	      unknown.Name, m.Name, m.Confidence)
*46c4c49dSIbrahim Kanouche//	  }
*46c4c49dSIbrahim Kanouche//	}
*46c4c49dSIbrahim Kanouchepackage stringclassifier
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanoucheimport (
*46c4c49dSIbrahim Kanouche	"fmt"
*46c4c49dSIbrahim Kanouche	"log"
*46c4c49dSIbrahim Kanouche	"math"
*46c4c49dSIbrahim Kanouche	"regexp"
*46c4c49dSIbrahim Kanouche	"sort"
*46c4c49dSIbrahim Kanouche	"sync"
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier/internal/pq"
*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier/searchset"
*46c4c49dSIbrahim Kanouche	"github.com/sergi/go-diff/diffmatchpatch"
*46c4c49dSIbrahim Kanouche)
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// The diff/match/patch algorithm.
*46c4c49dSIbrahim Kanouchevar dmp = diffmatchpatch.New()
*46c4c49dSIbrahim Kanouche
// Default tuning values used by the classifier.
const (
	// DefaultConfidenceThreshold is the smallest ratio of the matching
	// range to the full source range that is still considered likely to
	// yield a sufficiently good edit distance. A matching range whose
	// ratio is below this threshold is not handed to the Levenshtein
	// Distance algorithm.
	DefaultConfidenceThreshold float64 = 0.80

	// defaultMinDiffRatio is the MinDiffRatio given to a Classifier
	// created by New.
	defaultMinDiffRatio float64 = 0.75
)
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// A Classifier matches a string to a set of known values.
*46c4c49dSIbrahim Kanouchetype Classifier struct {
*46c4c49dSIbrahim Kanouche	muValues    sync.RWMutex
*46c4c49dSIbrahim Kanouche	values      map[string]*knownValue
*46c4c49dSIbrahim Kanouche	normalizers []NormalizeFunc
*46c4c49dSIbrahim Kanouche	threshold   float64
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	// MinDiffRatio defines the minimum ratio of the length difference
*46c4c49dSIbrahim Kanouche	// allowed to consider a known value a possible match. This is used as
*46c4c49dSIbrahim Kanouche	// a performance optimization to eliminate values that are unlikely to
*46c4c49dSIbrahim Kanouche	// be a match.
*46c4c49dSIbrahim Kanouche	//
*46c4c49dSIbrahim Kanouche	// For example, a value of 0.75 means that the shorter string must be
*46c4c49dSIbrahim Kanouche	// at least 75% the length of the longer string to consider it a
*46c4c49dSIbrahim Kanouche	// possible match.
*46c4c49dSIbrahim Kanouche	//
*46c4c49dSIbrahim Kanouche	// Setting this to 1.0 will require that strings are identical length.
*46c4c49dSIbrahim Kanouche	// Setting this to 0 will consider all known values as possible
*46c4c49dSIbrahim Kanouche	// matches.
*46c4c49dSIbrahim Kanouche	MinDiffRatio float64
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
// NormalizeFunc transforms a string into its normalized form. Normalization
// is applied to strings before they are compared.
type NormalizeFunc func(string) string
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// New creates a new Classifier with the provided NormalizeFuncs. Each
*46c4c49dSIbrahim Kanouche// NormalizeFunc is applied in order to a string before comparison.
*46c4c49dSIbrahim Kanouchefunc New(threshold float64, funcs ...NormalizeFunc) *Classifier {
*46c4c49dSIbrahim Kanouche	return &Classifier{
*46c4c49dSIbrahim Kanouche		values:       make(map[string]*knownValue),
*46c4c49dSIbrahim Kanouche		normalizers:  append([]NormalizeFunc(nil), funcs...),
*46c4c49dSIbrahim Kanouche		threshold:    threshold,
*46c4c49dSIbrahim Kanouche		MinDiffRatio: defaultMinDiffRatio,
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// knownValue identifies a value in the corpus to match against.
*46c4c49dSIbrahim Kanouchetype knownValue struct {
*46c4c49dSIbrahim Kanouche	key             string
*46c4c49dSIbrahim Kanouche	normalizedValue string
*46c4c49dSIbrahim Kanouche	reValue         *regexp.Regexp
*46c4c49dSIbrahim Kanouche	set             *searchset.SearchSet
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// AddValue adds a known value to be matched against. If a value already exists
*46c4c49dSIbrahim Kanouche// for key, an error is returned.
*46c4c49dSIbrahim Kanouchefunc (c *Classifier) AddValue(key, value string) error {
*46c4c49dSIbrahim Kanouche	c.muValues.Lock()
*46c4c49dSIbrahim Kanouche	defer c.muValues.Unlock()
*46c4c49dSIbrahim Kanouche	if _, ok := c.values[key]; ok {
*46c4c49dSIbrahim Kanouche		return fmt.Errorf("value already registered with key %q", key)
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	norm := c.normalize(value)
*46c4c49dSIbrahim Kanouche	c.values[key] = &knownValue{
*46c4c49dSIbrahim Kanouche		key:             key,
*46c4c49dSIbrahim Kanouche		normalizedValue: norm,
*46c4c49dSIbrahim Kanouche		reValue:         regexp.MustCompile(norm),
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return nil
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// AddPrecomputedValue adds a known value to be matched against. The value has
*46c4c49dSIbrahim Kanouche// already been normalized and the SearchSet object deserialized, so no
*46c4c49dSIbrahim Kanouche// processing is necessary.
*46c4c49dSIbrahim Kanouchefunc (c *Classifier) AddPrecomputedValue(key, value string, set *searchset.SearchSet) error {
*46c4c49dSIbrahim Kanouche	c.muValues.Lock()
*46c4c49dSIbrahim Kanouche	defer c.muValues.Unlock()
*46c4c49dSIbrahim Kanouche	if _, ok := c.values[key]; ok {
*46c4c49dSIbrahim Kanouche		return fmt.Errorf("value already registered with key %q", key)
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	set.GenerateNodeList()
*46c4c49dSIbrahim Kanouche	c.values[key] = &knownValue{
*46c4c49dSIbrahim Kanouche		key:             key,
*46c4c49dSIbrahim Kanouche		normalizedValue: value,
*46c4c49dSIbrahim Kanouche		reValue:         regexp.MustCompile(value),
*46c4c49dSIbrahim Kanouche		set:             set,
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return nil
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// normalize a string by applying each of the registered NormalizeFuncs.
*46c4c49dSIbrahim Kanouchefunc (c *Classifier) normalize(s string) string {
*46c4c49dSIbrahim Kanouche	for _, fn := range c.normalizers {
*46c4c49dSIbrahim Kanouche		s = fn(s)
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return s
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
// Match describes the outcome of matching a string against a knownValue.
type Match struct {
	// Name is the name of the knownValue that was matched.
	Name string

	// Confidence is the confidence percentage of the match.
	Confidence float64

	// Offset is the offset into the unknown string at which the match was
	// made.
	Offset int

	// Extent is the length of the match, counted from Offset into the
	// unknown string.
	Extent int
}

// Matches is a list of Match-es. It exists mainly so that the list can be
// sorted with the sort package.
type Matches []*Match

// Len returns the number of matches in the list.
func (m Matches) Len() int { return len(m) }

// Swap exchanges the matches at indexes i and j.
func (m Matches) Swap(i, j int) { m[j], m[i] = m[i], m[j] }

// Less reports whether the match at index i sorts before the match at index
// j. Matches are ordered by descending Confidence. Matches of equal
// confidence are ordered by ascending Name, then by ascending Offset, and
// finally by descending Extent.
func (m Matches) Less(i, j int) bool {
	a, b := m[i], m[j]
	if math.Abs(b.Confidence-a.Confidence) < math.SmallestNonzeroFloat64 {
		switch {
		case a.Name != b.Name:
			return a.Name < b.Name
		case a.Offset != b.Offset:
			return a.Offset < b.Offset
		default:
			return a.Extent > b.Extent
		}
	}
	return a.Confidence > b.Confidence
}

// Names returns the Name of every match, in the same order as the matches
// appear in m. No sorting is applied.
func (m Matches) Names() []string {
	var out []string
	for i := range m {
		out = append(out, m[i].Name)
	}
	return out
}

// uniquify returns the matches of m with every match dropped whose Offset
// lies inside the range (offset through offset+extent, inclusive) of a match
// kept earlier in the list. It assumes that m is sorted, so that an earlier
// match is one with a higher confidence.
func (m Matches) uniquify() Matches {
	type span struct {
		start, length int
	}

	var (
		claimed []span
		kept    Matches
	)
	for _, candidate := range m {
		covered := false
		for _, s := range claimed {
			if s.start <= candidate.Offset && candidate.Offset <= s.start+s.length {
				covered = true
				break
			}
		}
		if covered {
			continue
		}
		claimed = append(claimed, span{start: candidate.Offset, length: candidate.Extent})
		kept = append(kept, candidate)
	}

	return kept
}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// NearestMatch returns the name of the known value that most closely matches
*46c4c49dSIbrahim Kanouche// the unknown string and a confidence percentage is returned indicating how
*46c4c49dSIbrahim Kanouche// confident the classifier is in the result. A percentage of "1.0" indicates
*46c4c49dSIbrahim Kanouche// an exact match, while a percentage of "0.0" indicates a complete mismatch.
*46c4c49dSIbrahim Kanouche//
*46c4c49dSIbrahim Kanouche// If the string is equidistant from multiple known values, it is undefined
*46c4c49dSIbrahim Kanouche// which will be returned.
*46c4c49dSIbrahim Kanouchefunc (c *Classifier) NearestMatch(s string) *Match {
*46c4c49dSIbrahim Kanouche	pq := c.nearestMatch(s)
*46c4c49dSIbrahim Kanouche	if pq.Len() == 0 {
*46c4c49dSIbrahim Kanouche		return &Match{}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return pq.Pop().(*Match)
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// MultipleMatch tries to determine which known strings are found within an
*46c4c49dSIbrahim Kanouche// unknown string. This differs from "NearestMatch" in that it looks only at
*46c4c49dSIbrahim Kanouche// those areas within the unknown string that are likely to match. A list of
*46c4c49dSIbrahim Kanouche// potential matches are returned. It's up to the caller to determine which
*46c4c49dSIbrahim Kanouche// ones are acceptable.
*46c4c49dSIbrahim Kanouchefunc (c *Classifier) MultipleMatch(s string) (matches Matches) {
*46c4c49dSIbrahim Kanouche	pq := c.multipleMatch(s)
*46c4c49dSIbrahim Kanouche	if pq == nil {
*46c4c49dSIbrahim Kanouche		return matches
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	// A map to remove duplicate entries.
*46c4c49dSIbrahim Kanouche	m := make(map[Match]bool)
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	for pq.Len() != 0 {
*46c4c49dSIbrahim Kanouche		v := pq.Pop().(*Match)
*46c4c49dSIbrahim Kanouche		if _, ok := m[*v]; !ok {
*46c4c49dSIbrahim Kanouche			m[*v] = true
*46c4c49dSIbrahim Kanouche			matches = append(matches, v)
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	sort.Sort(matches)
*46c4c49dSIbrahim Kanouche	return matches.uniquify()
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// possibleMatch identifies a known value and it's diffRatio to a given string.
*46c4c49dSIbrahim Kanouchetype possibleMatch struct {
*46c4c49dSIbrahim Kanouche	value     *knownValue
*46c4c49dSIbrahim Kanouche	diffRatio float64
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// likelyMatches is a slice of possibleMatches that can be sorted by their
*46c4c49dSIbrahim Kanouche// diffRatio to a given string, such that the most likely matches (based on
*46c4c49dSIbrahim Kanouche// length) are at the beginning.
*46c4c49dSIbrahim Kanouchetype likelyMatches []possibleMatch
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouchefunc (m likelyMatches) Len() int           { return len(m) }
*46c4c49dSIbrahim Kanouchefunc (m likelyMatches) Less(i, j int) bool { return m[i].diffRatio > m[j].diffRatio }
*46c4c49dSIbrahim Kanouchefunc (m likelyMatches) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// nearestMatch returns a Queue of values that the unknown string may be. The
*46c4c49dSIbrahim Kanouche// values are compared via their Levenshtein Distance and ranked with the
*46c4c49dSIbrahim Kanouche// nearest match at the beginning.
*46c4c49dSIbrahim Kanouchefunc (c *Classifier) nearestMatch(unknown string) *pq.Queue {
*46c4c49dSIbrahim Kanouche	var mu sync.Mutex // Protect the priority queue.
*46c4c49dSIbrahim Kanouche	pq := pq.NewQueue(func(x, y interface{}) bool {
*46c4c49dSIbrahim Kanouche		return x.(*Match).Confidence > y.(*Match).Confidence
*46c4c49dSIbrahim Kanouche	}, nil)
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	unknown = c.normalize(unknown)
*46c4c49dSIbrahim Kanouche	if len(unknown) == 0 {
*46c4c49dSIbrahim Kanouche		return pq
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	c.muValues.RLock()
*46c4c49dSIbrahim Kanouche	var likely likelyMatches
*46c4c49dSIbrahim Kanouche	for _, v := range c.values {
*46c4c49dSIbrahim Kanouche		dr := diffRatio(unknown, v.normalizedValue)
*46c4c49dSIbrahim Kanouche		if dr < c.MinDiffRatio {
*46c4c49dSIbrahim Kanouche			continue
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		if unknown == v.normalizedValue {
*46c4c49dSIbrahim Kanouche			// We found an exact match.
*46c4c49dSIbrahim Kanouche			pq.Push(&Match{Name: v.key, Confidence: 1.0, Offset: 0, Extent: len(unknown)})
*46c4c49dSIbrahim Kanouche			c.muValues.RUnlock()
*46c4c49dSIbrahim Kanouche			return pq
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		likely = append(likely, possibleMatch{value: v, diffRatio: dr})
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	c.muValues.RUnlock()
*46c4c49dSIbrahim Kanouche	sort.Sort(likely)
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	var wg sync.WaitGroup
*46c4c49dSIbrahim Kanouche	classifyString := func(name, unknown, known string) {
*46c4c49dSIbrahim Kanouche		defer wg.Done()
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche		diffs := dmp.DiffMain(unknown, known, true)
*46c4c49dSIbrahim Kanouche		distance := dmp.DiffLevenshtein(diffs)
*46c4c49dSIbrahim Kanouche		confidence := confidencePercentage(len(unknown), len(known), distance)
*46c4c49dSIbrahim Kanouche		if confidence > 0.0 {
*46c4c49dSIbrahim Kanouche			mu.Lock()
*46c4c49dSIbrahim Kanouche			pq.Push(&Match{Name: name, Confidence: confidence, Offset: 0, Extent: len(unknown)})
*46c4c49dSIbrahim Kanouche			mu.Unlock()
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	wg.Add(len(likely))
*46c4c49dSIbrahim Kanouche	for _, known := range likely {
*46c4c49dSIbrahim Kanouche		go classifyString(known.value.key, unknown, known.value.normalizedValue)
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	wg.Wait()
*46c4c49dSIbrahim Kanouche	return pq
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// matcher finds all potential matches of "known" in "unknown". The results are
*46c4c49dSIbrahim Kanouche// placed in "queue".
*46c4c49dSIbrahim Kanouchetype matcher struct {
*46c4c49dSIbrahim Kanouche	unknown     *searchset.SearchSet
*46c4c49dSIbrahim Kanouche	normUnknown string
*46c4c49dSIbrahim Kanouche	threshold   float64
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	mu    sync.Mutex
*46c4c49dSIbrahim Kanouche	queue *pq.Queue
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// newMatcher creates a "matcher" object.
*46c4c49dSIbrahim Kanouchefunc newMatcher(unknown string, threshold float64) *matcher {
*46c4c49dSIbrahim Kanouche	return &matcher{
*46c4c49dSIbrahim Kanouche		unknown:     searchset.New(unknown, searchset.DefaultGranularity),
*46c4c49dSIbrahim Kanouche		normUnknown: unknown,
*46c4c49dSIbrahim Kanouche		threshold:   threshold,
*46c4c49dSIbrahim Kanouche		queue: pq.NewQueue(func(x, y interface{}) bool {
*46c4c49dSIbrahim Kanouche			return x.(*Match).Confidence > y.(*Match).Confidence
*46c4c49dSIbrahim Kanouche		}, nil),
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// findMatches takes a known text and finds all potential instances of it in
*46c4c49dSIbrahim Kanouche// the unknown text. The resulting matches can then filtered to determine which
*46c4c49dSIbrahim Kanouche// are the best matches.
*46c4c49dSIbrahim Kanouchefunc (m *matcher) findMatches(known *knownValue) {
*46c4c49dSIbrahim Kanouche	var mrs []searchset.MatchRanges
*46c4c49dSIbrahim Kanouche	if all := known.reValue.FindAllStringIndex(m.normUnknown, -1); all != nil {
*46c4c49dSIbrahim Kanouche		// We found exact matches. Just use those!
*46c4c49dSIbrahim Kanouche		for _, a := range all {
*46c4c49dSIbrahim Kanouche			var start, end int
*46c4c49dSIbrahim Kanouche			for i, tok := range m.unknown.Tokens {
*46c4c49dSIbrahim Kanouche				if tok.Offset == a[0] {
*46c4c49dSIbrahim Kanouche					start = i
*46c4c49dSIbrahim Kanouche				} else if tok.Offset >= a[len(a)-1]-len(tok.Text) {
*46c4c49dSIbrahim Kanouche					end = i
*46c4c49dSIbrahim Kanouche					break
*46c4c49dSIbrahim Kanouche				}
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche			mrs = append(mrs, searchset.MatchRanges{{
*46c4c49dSIbrahim Kanouche				SrcStart:    0,
*46c4c49dSIbrahim Kanouche				SrcEnd:      len(known.set.Tokens),
*46c4c49dSIbrahim Kanouche				TargetStart: start,
*46c4c49dSIbrahim Kanouche				TargetEnd:   end + 1,
*46c4c49dSIbrahim Kanouche			}})
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	} else {
*46c4c49dSIbrahim Kanouche		// No exact match. Perform a more thorough match.
*46c4c49dSIbrahim Kanouche		mrs = searchset.FindPotentialMatches(known.set, m.unknown)
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	var wg sync.WaitGroup
*46c4c49dSIbrahim Kanouche	for _, mr := range mrs {
*46c4c49dSIbrahim Kanouche		if !m.withinConfidenceThreshold(known.set, mr) {
*46c4c49dSIbrahim Kanouche			continue
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche		wg.Add(1)
*46c4c49dSIbrahim Kanouche		go func(mr searchset.MatchRanges) {
*46c4c49dSIbrahim Kanouche			start, end := mr.TargetRange(m.unknown)
*46c4c49dSIbrahim Kanouche			conf := levDist(m.normUnknown[start:end], known.normalizedValue)
*46c4c49dSIbrahim Kanouche			if conf > 0.0 {
*46c4c49dSIbrahim Kanouche				m.mu.Lock()
*46c4c49dSIbrahim Kanouche				m.queue.Push(&Match{Name: known.key, Confidence: conf, Offset: start, Extent: end - start})
*46c4c49dSIbrahim Kanouche				m.mu.Unlock()
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			wg.Done()
*46c4c49dSIbrahim Kanouche		}(mr)
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	wg.Wait()
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// withinConfidenceThreshold returns the Confidence we have in the potential
*46c4c49dSIbrahim Kanouche// match. It does this by calculating the ratio of what's matching to the
*46c4c49dSIbrahim Kanouche// original known text.
*46c4c49dSIbrahim Kanouchefunc (m *matcher) withinConfidenceThreshold(known *searchset.SearchSet, mr searchset.MatchRanges) bool {
*46c4c49dSIbrahim Kanouche	return float64(mr.Size())/float64(len(known.Tokens)) >= m.threshold
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// multipleMatch returns a Queue of values that might be within the unknown
*46c4c49dSIbrahim Kanouche// string. The values are compared via their Levenshtein Distance and ranked
*46c4c49dSIbrahim Kanouche// with the nearest match at the beginning.
*46c4c49dSIbrahim Kanouchefunc (c *Classifier) multipleMatch(unknown string) *pq.Queue {
*46c4c49dSIbrahim Kanouche	normUnknown := c.normalize(unknown)
*46c4c49dSIbrahim Kanouche	if normUnknown == "" {
*46c4c49dSIbrahim Kanouche		return nil
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	m := newMatcher(normUnknown, c.threshold)
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	c.muValues.RLock()
*46c4c49dSIbrahim Kanouche	var kvals []*knownValue
*46c4c49dSIbrahim Kanouche	for _, known := range c.values {
*46c4c49dSIbrahim Kanouche		kvals = append(kvals, known)
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	c.muValues.RUnlock()
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	var wg sync.WaitGroup
*46c4c49dSIbrahim Kanouche	wg.Add(len(kvals))
*46c4c49dSIbrahim Kanouche	for _, known := range kvals {
*46c4c49dSIbrahim Kanouche		go func(known *knownValue) {
*46c4c49dSIbrahim Kanouche			if known.set == nil {
*46c4c49dSIbrahim Kanouche				k := searchset.New(known.normalizedValue, searchset.DefaultGranularity)
*46c4c49dSIbrahim Kanouche				c.muValues.Lock()
*46c4c49dSIbrahim Kanouche				c.values[known.key].set = k
*46c4c49dSIbrahim Kanouche				c.muValues.Unlock()
*46c4c49dSIbrahim Kanouche			}
*46c4c49dSIbrahim Kanouche			m.findMatches(known)
*46c4c49dSIbrahim Kanouche			wg.Done()
*46c4c49dSIbrahim Kanouche		}(known)
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	wg.Wait()
*46c4c49dSIbrahim Kanouche	return m.queue
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// levDist runs the Levenshtein Distance algorithm on the known and unknown
*46c4c49dSIbrahim Kanouche// texts to measure how well they match.
*46c4c49dSIbrahim Kanouchefunc levDist(unknown, known string) float64 {
*46c4c49dSIbrahim Kanouche	if len(known) == 0 || len(unknown) == 0 {
*46c4c49dSIbrahim Kanouche		log.Printf("Zero-sized texts in Levenshtein Distance algorithm: known==%d, unknown==%d", len(known), len(unknown))
*46c4c49dSIbrahim Kanouche		return 0.0
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	// Calculate the differences between the potentially matching known
*46c4c49dSIbrahim Kanouche	// text and the unknown text.
*46c4c49dSIbrahim Kanouche	diffs := dmp.DiffMain(unknown, known, false)
*46c4c49dSIbrahim Kanouche	end := diffRangeEnd(known, diffs)
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche	// Now execute the Levenshtein Distance algorithm to see how much it
*46c4c49dSIbrahim Kanouche	// does match.
*46c4c49dSIbrahim Kanouche	distance := dmp.DiffLevenshtein(diffs[:end])
*46c4c49dSIbrahim Kanouche	return confidencePercentage(unknownTextLength(unknown, diffs), len(known), distance)
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// unknownTextLength returns the length of the unknown text based on the diff range.
*46c4c49dSIbrahim Kanouchefunc unknownTextLength(unknown string, diffs []diffmatchpatch.Diff) int {
*46c4c49dSIbrahim Kanouche	last := len(diffs) - 1
*46c4c49dSIbrahim Kanouche	for ; last >= 0; last-- {
*46c4c49dSIbrahim Kanouche		if diffs[last].Type == diffmatchpatch.DiffEqual {
*46c4c49dSIbrahim Kanouche			break
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	ulen := 0
*46c4c49dSIbrahim Kanouche	for i := 0; i < last+1; i++ {
*46c4c49dSIbrahim Kanouche		switch diffs[i].Type {
*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffDelete:
*46c4c49dSIbrahim Kanouche			ulen += len(diffs[i].Text)
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return ulen
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// diffRangeEnd returns the end index for the "Diff" objects that constructs
*46c4c49dSIbrahim Kanouche// (or nearly constructs) the "known" value.
*46c4c49dSIbrahim Kanouchefunc diffRangeEnd(known string, diffs []diffmatchpatch.Diff) (end int) {
*46c4c49dSIbrahim Kanouche	var seen string
*46c4c49dSIbrahim Kanouche	for end = 0; end < len(diffs); end++ {
*46c4c49dSIbrahim Kanouche		if seen == known {
*46c4c49dSIbrahim Kanouche			// Once we've constructed the "known" value, then we've
*46c4c49dSIbrahim Kanouche			// reached the point in the diff list where more
*46c4c49dSIbrahim Kanouche			// "Diff"s would just make the Levenshtein Distance
*46c4c49dSIbrahim Kanouche			// less valid. There shouldn't be further "DiffEqual"
*46c4c49dSIbrahim Kanouche			// nodes, because there's nothing further to match in
*46c4c49dSIbrahim Kanouche			// the "known" text.
*46c4c49dSIbrahim Kanouche			break
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche		switch diffs[end].Type {
*46c4c49dSIbrahim Kanouche		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
*46c4c49dSIbrahim Kanouche			seen += diffs[end].Text
*46c4c49dSIbrahim Kanouche		}
*46c4c49dSIbrahim Kanouche	}
*46c4c49dSIbrahim Kanouche	return end
*46c4c49dSIbrahim Kanouche}
*46c4c49dSIbrahim Kanouche
// confidencePercentage calculates how confident we are in the result of a
// match, given the lengths of the unknown and known texts and the edit
// distance between them. A result of "1.0" means an identical match, "0.0" a
// complete mismatch. Two empty texts match perfectly. A single empty text, or
// a distance larger than both lengths, is a complete mismatch. Otherwise the
// confidence is one minus the distance relative to the longer text.
func confidencePercentage(ulen, klen, distance int) float64 {
	switch {
	case ulen == 0 && klen == 0:
		return 1.0
	case ulen == 0 || klen == 0:
		return 0.0
	case distance > ulen && distance > klen:
		return 0.0
	}

	longest := klen
	if ulen > klen {
		longest = ulen
	}
	return 1.0 - float64(distance)/float64(longest)
}
*46c4c49dSIbrahim Kanouche
// diffRatio calculates the ratio between the lengths of s1 and s2, expressed
// as the length of the shorter string divided by the length of the longer
// one. E.g., diffRatio("abcd", "e") returns 0.25 because "e" is 25% of the
// size of "abcd". Strings of equal length, including two empty strings,
// yield 1.
func diffRatio(s1, s2 string) float64 {
	shorter, longer := len(s1), len(s2)
	if shorter > longer {
		shorter, longer = longer, shorter
	}
	if longer == 0 {
		// Both strings are zero length.
		return 1.0
	}
	return float64(shorter) / float64(longer)
}
*46c4c49dSIbrahim Kanouche
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
*46c4c49dSIbrahim Kanouche
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// wsRegexp is a regexp used to identify blocks of whitespace.
*46c4c49dSIbrahim Kanouchevar wsRegexp = regexp.MustCompile(`\s+`)
*46c4c49dSIbrahim Kanouche
*46c4c49dSIbrahim Kanouche// FlattenWhitespace will flatten contiguous blocks of whitespace down to a single space.
*46c4c49dSIbrahim Kanouchevar FlattenWhitespace NormalizeFunc = func(s string) string {
*46c4c49dSIbrahim Kanouche	return wsRegexp.ReplaceAllString(s, " ")
*46c4c49dSIbrahim Kanouche}