classifier.go (revision 46c4c49da23cae783fa41bf46525a6505638499a) - OpenGrok cross reference for /aosp_15_r20/external/licenseclassifier/stringclassifier/classifier.go

// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package stringclassifier finds the nearest match between a string and a set of known values. It
// uses the Levenshtein Distance (LD) algorithm to determine this. A match with a large LD is less
// likely to be correct than one with a small LD. A confidence percentage is returned, which
// indicates how confident the algorithm is that the match is correct. The higher the percentage,
// the greater the confidence that the match is correct.
//
// Example Usage:
//
//	type Text struct {
//	  Name string
//	  Text string
//	}
//
//	func NewClassifier(knownTexts []Text) (*stringclassifier.Classifier, error) {
//	  sc := stringclassifier.New(stringclassifier.FlattenWhitespace)
//	  for _, known := range knownTexts {
//	    if err := sc.AddValue(known.Name, known.Text); err != nil {
//	      return nil, err
//	    }
//	  }
//	  return sc, nil
//	}
//
//	func IdentifyTexts(sc *stringclassifier.Classifier, unknownTexts []*Text) {
//	  for _, unknown := range unknownTexts {
//	    m := sc.NearestMatch(unknown.Text)
//	    log.Printf("The nearest match to %q is %q (confidence: %v)",
//	      unknown.Name, m.Name, m.Confidence)
//	  }
//	}
package stringclassifier

import (
	"fmt"
	"log"
	"math"
	"regexp"
	"sort"
	"sync"

	"github.com/google/licenseclassifier/stringclassifier/internal/pq"
	"github.com/google/licenseclassifier/stringclassifier/searchset"
	"github.com/sergi/go-diff/diffmatchpatch"
)

// The diff/match/patch algorithm.
var dmp = diffmatchpatch.New()

const (
	// DefaultConfidenceThreshold is the minimum ratio threshold between
	// the matching range and the full source range that we're willing to
	// accept in order to say that the matching range will produce a
	// sufficiently good edit distance. I.e., if the matching range is
	// below this threshold we won't run the Levenshtein Distance algorithm
	// on it.
	DefaultConfidenceThreshold float64 = 0.80

	defaultMinDiffRatio float64 = 0.75
)

// A Classifier matches a string to a set of known values.
type Classifier struct {
	muValues    sync.RWMutex
	values      map[string]*knownValue
	normalizers []NormalizeFunc
	threshold   float64

	// MinDiffRatio defines the minimum ratio of the length difference
	// allowed to consider a known value a possible match. This is used as
	// a performance optimization to eliminate values that are unlikely to
	// be a match.
	//
	// For example, a value of 0.75 means that the shorter string must be
	// at least 75% the length of the longer string to consider it a
	// possible match.
	//
	// Setting this to 1.0 will require that strings are identical length.
	// Setting this to 0 will consider all known values as possible
	// matches.
	MinDiffRatio float64
}

// NormalizeFunc is a function that is used to normalize a string prior to comparison.
type NormalizeFunc func(string) string

// New creates a new Classifier with the provided NormalizeFuncs. Each
// NormalizeFunc is applied in order to a string before comparison.
func New(threshold float64, funcs ...NormalizeFunc) *Classifier {
	return &Classifier{
		values:       make(map[string]*knownValue),
		normalizers:  append([]NormalizeFunc(nil), funcs...),
		threshold:    threshold,
		MinDiffRatio: defaultMinDiffRatio,
	}
}

// knownValue identifies a value in the corpus to match against.
type knownValue struct {
	key             string
	normalizedValue string
	reValue         *regexp.Regexp
	set             *searchset.SearchSet
}

// AddValue adds a known value to be matched against. If a value already exists
// for key, an error is returned.
func (c *Classifier) AddValue(key, value string) error {
	c.muValues.Lock()
	defer c.muValues.Unlock()
	if _, ok := c.values[key]; ok {
		return fmt.Errorf("value already registered with key %q", key)
	}
	norm := c.normalize(value)
	c.values[key] = &knownValue{
		key:             key,
		normalizedValue: norm,
		reValue:         regexp.MustCompile(norm),
	}
	return nil
}

// AddPrecomputedValue adds a known value to be matched against. The value has
// already been normalized and the SearchSet object deserialized, so no
// processing is necessary.
func (c *Classifier) AddPrecomputedValue(key, value string, set *searchset.SearchSet) error {
	c.muValues.Lock()
	defer c.muValues.Unlock()
	if _, ok := c.values[key]; ok {
		return fmt.Errorf("value already registered with key %q", key)
	}
	set.GenerateNodeList()
	c.values[key] = &knownValue{
		key:             key,
		normalizedValue: value,
		reValue:         regexp.MustCompile(value),
		set:             set,
	}
	return nil
}

// normalize a string by applying each of the registered NormalizeFuncs.
func (c *Classifier) normalize(s string) string {
	for _, fn := range c.normalizers {
		s = fn(s)
	}
	return s
}

// Match identifies the result of matching a string against a knownValue.
type Match struct {
	Name       string  // Name of knownValue that was matched
	Confidence float64 // Confidence percentage
	Offset     int     // The offset into the unknown string the match was made
	Extent     int     // The length from the offset into the unknown string
}

// Matches is a list of Match-es. This is here mainly so that the list can be
// sorted.
type Matches []*Match

func (m Matches) Len() int      { return len(m) }
func (m Matches) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m Matches) Less(i, j int) bool {
	if math.Abs(m[j].Confidence-m[i].Confidence) < math.SmallestNonzeroFloat64 {
		if m[i].Name == m[j].Name {
			if m[i].Offset > m[j].Offset {
				return false
			}
			if m[i].Offset == m[j].Offset {
				return m[i].Extent > m[j].Extent
			}
			return true
		}
		return m[i].Name < m[j].Name
	}
	return m[i].Confidence > m[j].Confidence
}

// Names returns an unsorted slice of the names of the matched licenses.
func (m Matches) Names() []string {
	var names []string
	for _, n := range m {
		names = append(names, n.Name)
	}
	return names
}

// uniquify goes through the matches and removes any that are contained within
// one with a higher confidence. This assumes that Matches is sorted.
func (m Matches) uniquify() Matches {
	type matchedRange struct {
		offset, extent int
	}

	var matched []matchedRange
	var matches Matches
OUTER:
	for _, match := range m {
		for _, mr := range matched {
			if match.Offset >= mr.offset && match.Offset <= mr.offset+mr.extent {
				continue OUTER
			}
		}
		matched = append(matched, matchedRange{match.Offset, match.Extent})
		matches = append(matches, match)
	}

	return matches
}

// NearestMatch returns the name of the known value that most closely matches
// the unknown string and a confidence percentage is returned indicating how
// confident the classifier is in the result. A percentage of "1.0" indicates
// an exact match, while a percentage of "0.0" indicates a complete mismatch.
//
// If the string is equidistant from multiple known values, it is undefined
// which will be returned.
func (c *Classifier) NearestMatch(s string) *Match {
	pq := c.nearestMatch(s)
	if pq.Len() == 0 {
		return &Match{}
	}
	return pq.Pop().(*Match)
}

// MultipleMatch tries to determine which known strings are found within an
// unknown string. This differs from "NearestMatch" in that it looks only at
// those areas within the unknown string that are likely to match. A list of
// potential matches are returned. It's up to the caller to determine which
// ones are acceptable.
func (c *Classifier) MultipleMatch(s string) (matches Matches) {
	pq := c.multipleMatch(s)
	if pq == nil {
		return matches
	}

	// A map to remove duplicate entries.
	m := make(map[Match]bool)

	for pq.Len() != 0 {
		v := pq.Pop().(*Match)
		if _, ok := m[*v]; !ok {
			m[*v] = true
			matches = append(matches, v)
		}
	}

	sort.Sort(matches)
	return matches.uniquify()
}

// possibleMatch identifies a known value and it's diffRatio to a given string.
type possibleMatch struct {
	value     *knownValue
	diffRatio float64
}

// likelyMatches is a slice of possibleMatches that can be sorted by their
// diffRatio to a given string, such that the most likely matches (based on
// length) are at the beginning.
type likelyMatches []possibleMatch

func (m likelyMatches) Len() int           { return len(m) }
func (m likelyMatches) Less(i, j int) bool { return m[i].diffRatio > m[j].diffRatio }
func (m likelyMatches) Swap(i, j int)      { m[i], m[j] = m[j], m[i] }

// nearestMatch returns a Queue of values that the unknown string may be. The
// values are compared via their Levenshtein Distance and ranked with the
// nearest match at the beginning.
func (c *Classifier) nearestMatch(unknown string) *pq.Queue {
	var mu sync.Mutex // Protect the priority queue.
	pq := pq.NewQueue(func(x, y interface{}) bool {
		return x.(*Match).Confidence > y.(*Match).Confidence
	}, nil)

	unknown = c.normalize(unknown)
	if len(unknown) == 0 {
		return pq
	}

	c.muValues.RLock()
	var likely likelyMatches
	for _, v := range c.values {
		dr := diffRatio(unknown, v.normalizedValue)
		if dr < c.MinDiffRatio {
			continue
		}
		if unknown == v.normalizedValue {
			// We found an exact match.
			pq.Push(&Match{Name: v.key, Confidence: 1.0, Offset: 0, Extent: len(unknown)})
			c.muValues.RUnlock()
			return pq
		}
		likely = append(likely, possibleMatch{value: v, diffRatio: dr})
	}
	c.muValues.RUnlock()
	sort.Sort(likely)

	var wg sync.WaitGroup
	classifyString := func(name, unknown, known string) {
		defer wg.Done()

		diffs := dmp.DiffMain(unknown, known, true)
		distance := dmp.DiffLevenshtein(diffs)
		confidence := confidencePercentage(len(unknown), len(known), distance)
		if confidence > 0.0 {
			mu.Lock()
			pq.Push(&Match{Name: name, Confidence: confidence, Offset: 0, Extent: len(unknown)})
			mu.Unlock()
		}
	}

	wg.Add(len(likely))
	for _, known := range likely {
		go classifyString(known.value.key, unknown, known.value.normalizedValue)
	}
	wg.Wait()
	return pq
}

// matcher finds all potential matches of "known" in "unknown". The results are
// placed in "queue".
type matcher struct {
	unknown     *searchset.SearchSet
	normUnknown string
	threshold   float64

	mu    sync.Mutex
	queue *pq.Queue
}

// newMatcher creates a "matcher" object.
func newMatcher(unknown string, threshold float64) *matcher {
	return &matcher{
		unknown:     searchset.New(unknown, searchset.DefaultGranularity),
		normUnknown: unknown,
		threshold:   threshold,
		queue: pq.NewQueue(func(x, y interface{}) bool {
			return x.(*Match).Confidence > y.(*Match).Confidence
		}, nil),
	}
}

// findMatches takes a known text and finds all potential instances of it in
// the unknown text. The resulting matches can then filtered to determine which
// are the best matches.
func (m *matcher) findMatches(known *knownValue) {
	var mrs []searchset.MatchRanges
	if all := known.reValue.FindAllStringIndex(m.normUnknown, -1); all != nil {
		// We found exact matches. Just use those!
		for _, a := range all {
			var start, end int
			for i, tok := range m.unknown.Tokens {
				if tok.Offset == a[0] {
					start = i
				} else if tok.Offset >= a[len(a)-1]-len(tok.Text) {
					end = i
					break
				}
			}

			mrs = append(mrs, searchset.MatchRanges{{
				SrcStart:    0,
				SrcEnd:      len(known.set.Tokens),
				TargetStart: start,
				TargetEnd:   end + 1,
			}})
		}
	} else {
		// No exact match. Perform a more thorough match.
		mrs = searchset.FindPotentialMatches(known.set, m.unknown)
	}

	var wg sync.WaitGroup
	for _, mr := range mrs {
		if !m.withinConfidenceThreshold(known.set, mr) {
			continue
		}

		wg.Add(1)
		go func(mr searchset.MatchRanges) {
			start, end := mr.TargetRange(m.unknown)
			conf := levDist(m.normUnknown[start:end], known.normalizedValue)
			if conf > 0.0 {
				m.mu.Lock()
				m.queue.Push(&Match{Name: known.key, Confidence: conf, Offset: start, Extent: end - start})
				m.mu.Unlock()
			}
			wg.Done()
		}(mr)
	}
	wg.Wait()
}

// withinConfidenceThreshold returns the Confidence we have in the potential
// match. It does this by calculating the ratio of what's matching to the
// original known text.
func (m *matcher) withinConfidenceThreshold(known *searchset.SearchSet, mr searchset.MatchRanges) bool {
	return float64(mr.Size())/float64(len(known.Tokens)) >= m.threshold
}

// multipleMatch returns a Queue of values that might be within the unknown
// string. The values are compared via their Levenshtein Distance and ranked
// with the nearest match at the beginning.
func (c *Classifier) multipleMatch(unknown string) *pq.Queue {
	normUnknown := c.normalize(unknown)
	if normUnknown == "" {
		return nil
	}

	m := newMatcher(normUnknown, c.threshold)

	c.muValues.RLock()
	var kvals []*knownValue
	for _, known := range c.values {
		kvals = append(kvals, known)
	}
	c.muValues.RUnlock()

	var wg sync.WaitGroup
	wg.Add(len(kvals))
	for _, known := range kvals {
		go func(known *knownValue) {
			if known.set == nil {
				k := searchset.New(known.normalizedValue, searchset.DefaultGranularity)
				c.muValues.Lock()
				c.values[known.key].set = k
				c.muValues.Unlock()
			}
			m.findMatches(known)
			wg.Done()
		}(known)
	}
	wg.Wait()
	return m.queue
}

// levDist runs the Levenshtein Distance algorithm on the known and unknown
// texts to measure how well they match.
func levDist(unknown, known string) float64 {
	if len(known) == 0 || len(unknown) == 0 {
		log.Printf("Zero-sized texts in Levenshtein Distance algorithm: known==%d, unknown==%d", len(known), len(unknown))
		return 0.0
	}

	// Calculate the differences between the potentially matching known
	// text and the unknown text.
	diffs := dmp.DiffMain(unknown, known, false)
	end := diffRangeEnd(known, diffs)

	// Now execute the Levenshtein Distance algorithm to see how much it
	// does match.
	distance := dmp.DiffLevenshtein(diffs[:end])
	return confidencePercentage(unknownTextLength(unknown, diffs), len(known), distance)
}

// unknownTextLength returns the length of the unknown text based on the diff range.
func unknownTextLength(unknown string, diffs []diffmatchpatch.Diff) int {
	last := len(diffs) - 1
	for ; last >= 0; last-- {
		if diffs[last].Type == diffmatchpatch.DiffEqual {
			break
		}
	}
	ulen := 0
	for i := 0; i < last+1; i++ {
		switch diffs[i].Type {
		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffDelete:
			ulen += len(diffs[i].Text)
		}
	}
	return ulen
}

// diffRangeEnd returns the end index for the "Diff" objects that constructs
// (or nearly constructs) the "known" value.
func diffRangeEnd(known string, diffs []diffmatchpatch.Diff) (end int) {
	var seen string
	for end = 0; end < len(diffs); end++ {
		if seen == known {
			// Once we've constructed the "known" value, then we've
			// reached the point in the diff list where more
			// "Diff"s would just make the Levenshtein Distance
			// less valid. There shouldn't be further "DiffEqual"
			// nodes, because there's nothing further to match in
			// the "known" text.
			break
		}
		switch diffs[end].Type {
		case diffmatchpatch.DiffEqual, diffmatchpatch.DiffInsert:
			seen += diffs[end].Text
		}
	}
	return end
}

// confidencePercentage calculates how confident we are in the result of the
// match. A percentage of "1.0" means an identical match. A confidence of "0.0"
// means a complete mismatch.
func confidencePercentage(ulen, klen, distance int) float64 {
	if ulen == 0 && klen == 0 {
		return 1.0
	}
	if ulen == 0 || klen == 0 || (distance > ulen && distance > klen) {
		return 0.0
	}
	return 1.0 - float64(distance)/float64(max(ulen, klen))
}

// diffRatio calculates the ratio of the length of s1 and s2, returned as a
// percentage of the length of the longer string. E.g., diffLength("abcd", "e")
// would return 0.25 because "e" is 25% of the size of "abcd". Comparing
// strings of equal length will return 1.
func diffRatio(s1, s2 string) float64 {
	x, y := len(s1), len(s2)
	if x == 0 && y == 0 {
		// Both strings are zero length
		return 1.0
	}
	if x < y {
		return float64(x) / float64(y)
	}
	return float64(y) / float64(x)
}

func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}

func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// wsRegexp is a regexp used to identify blocks of whitespace.
var wsRegexp = regexp.MustCompile(`\s+`)

// FlattenWhitespace will flatten contiguous blocks of whitespace down to a single space.
var FlattenWhitespace NormalizeFunc = func(s string) string {
	return wsRegexp.ReplaceAllString(s, " ")
}